From 2b57be80483665b2f55e40fe479ef7600bfa2a02 Mon Sep 17 00:00:00 2001 From: Yong-Shin Jiang Date: Sun, 14 Jun 2026 22:51:21 -0700 Subject: [PATCH 1/8] Add unified parse_json runtime type checker (#3285) Introduce zarr.core.json_parse.parse_json, a single type-annotation-driven validator that consolidates the scattered per-field parse_* helpers. Handles primitives, Literal, unions/Optional, fixed and variadic tuples, Sequence/list (coerced to tuple), Mapping/dict, and TypedDict, with a bool-vs-int safe primitive check. Adds tests/test_json_parse.py (94 tests) and a changelog fragment. No existing call sites migrated yet; this is the proof-of-direction module. Co-Authored-By: Claude Opus 4.8 (1M context) --- changes/3285.feature.md | 1 + src/zarr/core/json_parse.py | 271 ++++++++++++++++++++ tests/test_json_parse.py | 476 ++++++++++++++++++++++++++++++++++++ 3 files changed, 748 insertions(+) create mode 100644 changes/3285.feature.md create mode 100644 src/zarr/core/json_parse.py create mode 100644 tests/test_json_parse.py diff --git a/changes/3285.feature.md b/changes/3285.feature.md new file mode 100644 index 0000000000..498f1d38e7 --- /dev/null +++ b/changes/3285.feature.md @@ -0,0 +1 @@ +Add `zarr.core.json_parse.parse_json`, a unified runtime type checker that validates JSON-decoded metadata values against a type annotation and returns data assignable to it (coercing sequences to tuples), or raises a clear error. It handles the JSON-relevant type categories used by array metadata: primitives (`None`, `str`, `int`, `float`, `bool`), `Literal`, unions/`Optional`, fixed and variadic `tuple`, `Sequence`/`list`, `Mapping`/`dict`, and `TypedDict`. This begins de-duplicating the scattered per-field `parse_*` helpers into a single shared validation path (see #3285). diff --git a/src/zarr/core/json_parse.py b/src/zarr/core/json_parse.py new file mode 100644 index 0000000000..9e22121bca --- /dev/null +++ b/src/zarr/core/json_parse.py @@ -0,0 +1,271 @@ +"""Unified runtime type checking for JSON-decoded metadata. + +This module provides a single entry point, :func:`parse_json`, that validates a +JSON-decoded ``value`` against a Python ``type_annotation`` and returns data +assignable to that annotation (coercing where sensible, e.g. a list is coerced +to a ``tuple`` for ``Sequence[T]`` annotations), or raises a useful exception. + +It is intended to consolidate the many hand-written ``parse_*`` helpers spread +across the codebase (see issue #3285). The scope is deliberately limited to the +JSON-shaped types that appear in Zarr metadata: + +* primitives: ``None``, ``str``, ``int``, ``float``, ``bool`` +* ``Literal[...]`` +* unions, including ``Optional[T]`` and ``X | Y`` +* ``tuple[...]`` (fixed-length and variadic) +* ``Sequence[T]`` / ``list[T]`` (coerced to ``tuple``) +* ``Mapping[str, T]`` / ``dict[str, T]`` +* :class:`~typing.TypedDict` + +Anything outside this scope raises a :class:`TypeError`. +""" + +from __future__ import annotations + +import types +from collections.abc import Mapping, Sequence +from typing import Any, Literal, Union, get_args, get_origin + +from typing_extensions import get_type_hints, is_typeddict + +__all__ = ["parse_json"] + +# Primitive types handled by a plain ``isinstance`` check. ``bool`` deliberately +# comes before ``int`` because ``bool`` is a subclass of ``int`` and the two +# must not be confused (see ``_parse_primitive``). +_PRIMITIVES: tuple[type, ...] = (bool, int, float, str) + + +def parse_json(value: object, type_annotation: object) -> Any: + """Validate ``value`` against ``type_annotation`` and return assignable data. + + Parameters + ---------- + value : object + A JSON-decoded value (``str``, ``int``, ``float``, ``bool``, ``None``, + ``list``/``Sequence`` or ``dict``/``Mapping``). + type_annotation : object + The expected type. One of the categories listed in the module + docstring. + + Returns + ------- + object + ``value``, possibly coerced (e.g. a list coerced to a ``tuple`` for a + ``Sequence[T]`` annotation, or a TypedDict-shaped ``dict``). + + Raises + ------ + ValueError + If ``value`` does not satisfy a primitive, literal, or union + annotation. + TypeError + If ``value`` has the wrong container shape, or if + ``type_annotation`` is not within the supported scope. + """ + # 1. None / type(None) + if type_annotation is None or type_annotation is type(None): + if value is None: + return None + raise ValueError(f"Expected None, got {value!r} instead.") + + origin = get_origin(type_annotation) + + # 3. Literal[...] + if origin is Literal: + return _parse_literal(value, type_annotation) + + # 4. Union / Optional / types.UnionType (X | Y) + if origin is Union or origin is types.UnionType: + return _parse_union(value, type_annotation) + + # 8. TypedDict + if is_typeddict(type_annotation): + return _parse_typeddict(value, type_annotation) + + # 5. tuple[...] + if origin is tuple: + return _parse_tuple(value, type_annotation) + + # 7. Mapping[str, T] / dict[str, T] + if origin is not None and isinstance(origin, type) and issubclass(origin, Mapping): + return _parse_mapping(value, type_annotation) + + # 6. Sequence[T] / list[T] (after tuple/Mapping so those take precedence) + if origin is not None and isinstance(origin, type) and issubclass(origin, Sequence): + return _parse_sequence(value, type_annotation) + + # 2. Primitives (str, int, float, bool) + if isinstance(type_annotation, type) and issubclass(type_annotation, _PRIMITIVES): + return _parse_primitive(value, type_annotation) + + # 9. Fallback + raise TypeError( + f"Cannot parse value {value!r} against unsupported type annotation " + f"{type_annotation!r}." + ) + + +def _parse_primitive(value: object, type_annotation: type) -> Any: + """Validate ``value`` against a primitive type via ``isinstance``. + + The critical edge case is that ``bool`` is a subclass of ``int``. When an + ``int`` is expected a ``bool`` must be rejected, and when a ``bool`` is + expected an ``int`` must be rejected. + """ + if type_annotation is bool: + if isinstance(value, bool): + return value + raise ValueError(f"Expected bool, got {value!r} instead.") + + if type_annotation is int: + # Reject bool, which is an ``int`` subclass. + if isinstance(value, int) and not isinstance(value, bool): + return value + raise ValueError(f"Expected int, got {value!r} instead.") + + if type_annotation is float: + # Reject bool, which is an ``int`` (and thus a ``float``-compatible) value. + if isinstance(value, float) and not isinstance(value, bool): + return value + raise ValueError(f"Expected float, got {value!r} instead.") + + # str + if isinstance(value, type_annotation): + return value + raise ValueError(f"Expected {type_annotation.__name__}, got {value!r} instead.") + + +def _parse_literal(value: object, type_annotation: object) -> Any: + """Validate that ``value`` is one of the literal members.""" + choices = get_args(type_annotation) + # ``True == 1`` in Python, so guard against a bool being accepted for an int + # literal (and vice versa) by also comparing types for the matched member. + for choice in choices: + if value == choice and type(value) is type(choice): + return value + raise ValueError(f"Expected one of {choices!r}, got {value!r} instead.") + + +def _parse_union(value: object, type_annotation: object) -> Any: + """Try each union member; return the first that parses, else aggregate.""" + members = get_args(type_annotation) + errors: list[str] = [] + for member in members: + try: + return parse_json(value, member) + except (ValueError, TypeError) as exc: # noqa: PERF203 + errors.append(f" - against {member!r}: {exc}") + joined = "\n".join(errors) + raise ValueError( + f"Expected a value matching one of {members!r}, got {value!r} instead. " + f"Tried each union member:\n{joined}" + ) + + +def _parse_tuple(value: object, type_annotation: object) -> tuple[Any, ...]: + """Validate a fixed-length or variadic ``tuple[...]`` annotation. + + ``tuple[int, str]`` is fixed-length; ``tuple[int, ...]`` is variadic. Each + element is recursively parsed against its corresponding element type. + """ + if not isinstance(value, Sequence) or isinstance(value, str | bytes): + raise TypeError(f"Expected a sequence, got {value!r} instead.") + + args = get_args(type_annotation) + + # Bare ``tuple`` with no parameters: accept any sequence of elements. + if not args: + return tuple(value) + + # Variadic ``tuple[T, ...]``. + if len(args) == 2 and args[1] is Ellipsis: + element_type = args[0] + return tuple(parse_json(item, element_type) for item in value) + + # Special-case the empty tuple ``tuple[()]``. + if args == ((),): + args = () + + # Fixed-length ``tuple[T1, T2, ...]``. + if len(value) != len(args): + raise TypeError( + f"Expected a sequence of length {len(args)}, got {value!r} of " + f"length {len(value)} instead." + ) + return tuple(parse_json(item, element_type) for item, element_type in zip(value, args, strict=True)) + + +def _parse_sequence(value: object, type_annotation: object) -> tuple[Any, ...]: + """Validate ``Sequence[T]`` / ``list[T]``, returning a ``tuple``. + + Each element is recursively parsed against ``T``. A ``str`` is *not* a valid + sequence here, even though it is technically a ``Sequence[str]``, because in + JSON terms a string is a primitive, not an array. + """ + if not isinstance(value, Sequence) or isinstance(value, str | bytes): + raise TypeError(f"Expected a sequence, got {value!r} instead.") + + args = get_args(type_annotation) + if not args: + # Bare ``list`` / ``Sequence`` with no element type: accept any element. + return tuple(value) + element_type = args[0] + return tuple(parse_json(item, element_type) for item in value) + + +def _parse_mapping(value: object, type_annotation: object) -> dict[str, Any]: + """Validate ``Mapping[str, T]`` / ``dict[str, T]``, returning a ``dict``. + + Keys must be ``str`` and each value is recursively parsed against ``T``. + """ + if not isinstance(value, Mapping): + raise TypeError(f"Expected a mapping, got {value!r} instead.") + + args = get_args(type_annotation) + if args: + key_type, value_type = args[0], args[1] + else: + key_type, value_type = str, object + + result: dict[str, Any] = {} + for key, val in value.items(): + if key_type is str and not isinstance(key, str): + raise TypeError(f"Expected mapping key to be str, got {key!r} instead.") + result[key] = parse_json(val, value_type) if value_type is not object else val + return result + + +def _parse_typeddict(value: object, type_annotation: Any) -> dict[str, Any]: + """Validate a :class:`~typing.TypedDict` annotation. + + Each required key must be present and is parsed against its annotation. + Optional keys are parsed when present. Totality is respected via the + TypedDict's ``__required_keys__`` / ``__optional_keys__`` attributes. + """ + if not isinstance(value, Mapping): + raise TypeError(f"Expected a mapping, got {value!r} instead.") + + # ``include_extras=True`` keeps ``Required`` / ``NotRequired`` / ``ReadOnly`` + # wrappers visible if needed; the required/optional split is taken from the + # TypedDict's own bookkeeping which already accounts for totality. + hints = get_type_hints(type_annotation, include_extras=False) + required_keys: frozenset[str] = getattr(type_annotation, "__required_keys__", frozenset()) + optional_keys: frozenset[str] = getattr(type_annotation, "__optional_keys__", frozenset()) + + missing = [key for key in required_keys if key not in value] + if missing: + raise ValueError( + f"Expected required key(s) {sorted(missing)!r} for " + f"{type_annotation.__name__}, got {value!r} instead." + ) + + result: dict[str, Any] = {} + for key in required_keys | optional_keys: + if key in value: + result[key] = parse_json(value[key], hints[key]) + # Preserve any extra keys not declared on the TypedDict. + for key, val in value.items(): + if key not in result: + result[key] = val + return result diff --git a/tests/test_json_parse.py b/tests/test_json_parse.py new file mode 100644 index 0000000000..87f9f587c4 --- /dev/null +++ b/tests/test_json_parse.py @@ -0,0 +1,476 @@ +"""Tests for :func:`zarr.core.json_parse.parse_json`. + +These cover every dispatch category described in section 4.3 of the SDD: +primitives (incl. the bool/int edge case), ``Literal``, unions / ``Optional``, +fixed-length and variadic ``tuple``, ``Sequence`` / ``list`` (with coercion to +``tuple``), ``Mapping`` / ``dict``, ``TypedDict`` (required/optional/nested), +and the fallback ``TypeError`` for unsupported annotations. +""" + +from __future__ import annotations + +from collections.abc import Mapping, Sequence +from typing import TYPE_CHECKING, Literal, Optional, Union + +import pytest + +# ``parse_json`` (the module under test) uses ``typing_extensions`` for its +# TypedDict bookkeeping, so we build TypedDicts the same way for consistency. +from typing_extensions import NotRequired, TypedDict + +from zarr.core.json_parse import parse_json + +if TYPE_CHECKING: + from typing import Any + + +# --------------------------------------------------------------------------- +# None / type(None) +# --------------------------------------------------------------------------- + + +@pytest.mark.parametrize("annotation", [None, type(None)]) +def test_none_valid(annotation: Any) -> None: + assert parse_json(None, annotation) is None + + +@pytest.mark.parametrize("annotation", [None, type(None)]) +@pytest.mark.parametrize("value", [0, "", False, [], {}]) +def test_none_invalid(annotation: Any, value: Any) -> None: + with pytest.raises(ValueError, match="Expected None"): + parse_json(value, annotation) + + +# --------------------------------------------------------------------------- +# Primitives: str, int, float, bool +# --------------------------------------------------------------------------- + + +@pytest.mark.parametrize( + ("value", "annotation"), + [ + ("hello", str), + ("", str), + (1, int), + (-5, int), + (1.5, float), + (-0.0, float), + (True, bool), + (False, bool), + ], +) +def test_primitive_valid(value: Any, annotation: type) -> None: + result = parse_json(value, annotation) + assert result == value + # The exact object is returned unchanged for primitives. + assert result is value + + +@pytest.mark.parametrize( + ("value", "annotation"), + [ + (1, str), + (1.0, str), + (True, str), + ("1", int), + (1.0, int), + ("x", float), + (1, float), # an int is NOT a float here + ("true", bool), + (0, bool), + ], +) +def test_primitive_wrong_type_raises(value: Any, annotation: type) -> None: + with pytest.raises(ValueError, match="Expected"): + parse_json(value, annotation) + + +# --- The CRITICAL bool/int edge cases ------------------------------------- + + +def test_bool_not_accepted_as_int() -> None: + """``True`` (an ``int`` subclass instance) must NOT satisfy ``int``.""" + with pytest.raises(ValueError, match="Expected int"): + parse_json(True, int) + with pytest.raises(ValueError, match="Expected int"): + parse_json(False, int) + + +def test_int_not_accepted_as_bool() -> None: + """A plain ``int`` must NOT satisfy ``bool``.""" + with pytest.raises(ValueError, match="Expected bool"): + parse_json(1, bool) + with pytest.raises(ValueError, match="Expected bool"): + parse_json(0, bool) + + +def test_bool_accepted_as_bool() -> None: + assert parse_json(True, bool) is True + assert parse_json(False, bool) is False + + +def test_int_accepted_as_int() -> None: + assert parse_json(1, int) == 1 + assert parse_json(0, int) == 0 + + +def test_bool_not_accepted_as_float() -> None: + """``bool`` is an ``int`` subclass and must not satisfy ``float`` either.""" + with pytest.raises(ValueError, match="Expected float"): + parse_json(True, float) + + +# --------------------------------------------------------------------------- +# Literal[...] +# --------------------------------------------------------------------------- + + +@pytest.mark.parametrize("value", ["C", "F"]) +def test_literal_str_member_valid(value: str) -> None: + assert parse_json(value, Literal["C", "F"]) == value + + +@pytest.mark.parametrize("value", ["Q", "c", "", 0]) +def test_literal_str_non_member_raises(value: Any) -> None: + with pytest.raises(ValueError, match="Expected one of"): + parse_json(value, Literal["C", "F"]) + + +@pytest.mark.parametrize("value", [1, 2]) +def test_literal_int_member_valid(value: int) -> None: + assert parse_json(value, Literal[1, 2]) == value + + +def test_literal_int_non_member_raises() -> None: + with pytest.raises(ValueError, match="Expected one of"): + parse_json(3, Literal[1, 2]) + + +def test_literal_true_does_not_match_int_literal() -> None: + """``True == 1`` in Python, but ``True`` must NOT satisfy ``Literal[1, 2]``.""" + with pytest.raises(ValueError, match="Expected one of"): + parse_json(True, Literal[1, 2]) + + +def test_literal_one_does_not_match_bool_literal() -> None: + """Conversely, ``1`` must not satisfy a bool literal.""" + with pytest.raises(ValueError, match="Expected one of"): + parse_json(1, Literal[True, False]) + + +def test_literal_mixed_members() -> None: + annotation = Literal["a", 3, None] + assert parse_json("a", annotation) == "a" + assert parse_json(3, annotation) == 3 + # ``None`` is a valid literal member. + assert parse_json(None, annotation) is None + + +# --------------------------------------------------------------------------- +# Union / Optional +# --------------------------------------------------------------------------- + + +@pytest.mark.parametrize("value", ["hello", 5]) +def test_union_each_member_accepted(value: Any) -> None: + assert parse_json(value, Union[str, int]) == value + + +def test_union_pep604_syntax() -> None: + """The ``X | Y`` syntax (``types.UnionType``) is handled too.""" + assert parse_json("hello", str | int) == "hello" + assert parse_json(5, str | int) == 5 + + +def test_union_no_member_matches_raises() -> None: + with pytest.raises(ValueError, match="Expected a value matching one of"): + parse_json(1.5, Union[str, int]) + + +def test_union_error_lists_each_member() -> None: + with pytest.raises(ValueError, match="Tried each union member"): + parse_json([], Union[str, int]) + + +@pytest.mark.parametrize("value", [None, "x"]) +def test_optional_accepts_none_and_value(value: Any) -> None: + assert parse_json(value, Optional[str]) == value + + +def test_optional_rejects_other_types() -> None: + with pytest.raises(ValueError, match="Expected a value matching one of"): + parse_json(5, Optional[str]) + + +def test_union_with_coercion_returns_coerced_value() -> None: + """A union member that coerces (Sequence -> tuple) returns the coerced form.""" + result = parse_json([1, 2, 3], Union[str, Sequence[int]]) + assert result == (1, 2, 3) + assert isinstance(result, tuple) + + +# --------------------------------------------------------------------------- +# tuple[...] +# --------------------------------------------------------------------------- + + +def test_tuple_fixed_length_valid() -> None: + result = parse_json([1, "a"], tuple[int, str]) + assert result == (1, "a") + assert isinstance(result, tuple) + + +def test_tuple_fixed_length_wrong_length_raises() -> None: + with pytest.raises(TypeError, match="Expected a sequence of length 2"): + parse_json([1], tuple[int, str]) + with pytest.raises(TypeError, match="Expected a sequence of length 2"): + parse_json([1, "a", "extra"], tuple[int, str]) + + +def test_tuple_fixed_length_wrong_element_type_raises() -> None: + # Second element should be str but is an int -> inner primitive ValueError. + with pytest.raises(ValueError, match="Expected str"): + parse_json([1, 2], tuple[int, str]) + + +def test_tuple_variadic_valid() -> None: + result = parse_json([1, 2, 3], tuple[int, ...]) + assert result == (1, 2, 3) + assert isinstance(result, tuple) + + +def test_tuple_variadic_empty() -> None: + assert parse_json([], tuple[int, ...]) == () + + +def test_tuple_variadic_wrong_element_raises() -> None: + with pytest.raises(ValueError, match="Expected int"): + parse_json([1, "x", 3], tuple[int, ...]) + + +def test_tuple_accepts_tuple_input() -> None: + assert parse_json((1, "a"), tuple[int, str]) == (1, "a") + + +def test_tuple_rejects_non_sequence() -> None: + with pytest.raises(TypeError, match="Expected a sequence"): + parse_json(5, tuple[int, ...]) + + +def test_tuple_rejects_str_input() -> None: + """A ``str`` is not treated as a sequence for tuple parsing.""" + with pytest.raises(TypeError, match="Expected a sequence"): + parse_json("ab", tuple[str, str]) + + +def test_tuple_empty_annotation() -> None: + """``tuple[()]`` matches the empty sequence.""" + assert parse_json([], tuple[()]) == () + + +# --------------------------------------------------------------------------- +# Sequence[T] / list[T] (coerced to tuple) +# --------------------------------------------------------------------------- + + +def test_sequence_coerces_to_tuple() -> None: + result = parse_json([1, 2, 3], Sequence[int]) + assert result == (1, 2, 3) + assert isinstance(result, tuple) + + +def test_list_coerces_to_tuple() -> None: + result = parse_json([1, 2, 3], list[int]) + assert result == (1, 2, 3) + assert isinstance(result, tuple) + + +def test_sequence_validates_elements() -> None: + with pytest.raises(ValueError, match="Expected int"): + parse_json([1, "two", 3], Sequence[int]) + + +def test_sequence_rejects_str() -> None: + """A bare string is a primitive in JSON terms, not a sequence.""" + with pytest.raises(TypeError, match="Expected a sequence"): + parse_json("abc", Sequence[str]) + + +def test_sequence_rejects_non_sequence() -> None: + with pytest.raises(TypeError, match="Expected a sequence"): + parse_json(42, Sequence[int]) + + +def test_sequence_of_sequence_nested() -> None: + result = parse_json([[1, 2], [3]], Sequence[Sequence[int]]) + assert result == ((1, 2), (3,)) + assert isinstance(result, tuple) + assert all(isinstance(inner, tuple) for inner in result) + + +# --------------------------------------------------------------------------- +# Mapping[str, T] / dict[str, T] +# --------------------------------------------------------------------------- + + +def test_mapping_valid_returns_dict() -> None: + result = parse_json({"a": 1, "b": 2}, Mapping[str, int]) + assert result == {"a": 1, "b": 2} + assert isinstance(result, dict) + + +def test_dict_valid_returns_dict() -> None: + result = parse_json({"a": 1}, dict[str, int]) + assert result == {"a": 1} + assert isinstance(result, dict) + + +def test_mapping_wrong_value_type_raises() -> None: + with pytest.raises(ValueError, match="Expected int"): + parse_json({"a": "not an int"}, Mapping[str, int]) + + +def test_mapping_non_str_key_raises() -> None: + with pytest.raises(TypeError, match="Expected mapping key to be str"): + parse_json({1: 1}, Mapping[str, int]) + + +def test_mapping_non_mapping_raises() -> None: + with pytest.raises(TypeError, match="Expected a mapping"): + parse_json([("a", 1)], Mapping[str, int]) + + +def test_mapping_nested_values() -> None: + result = parse_json({"a": [1, 2], "b": [3]}, Mapping[str, Sequence[int]]) + assert result == {"a": (1, 2), "b": (3,)} + assert isinstance(result, dict) + assert all(isinstance(v, tuple) for v in result.values()) + + +# --------------------------------------------------------------------------- +# TypedDict +# --------------------------------------------------------------------------- + + +class Point(TypedDict): + x: int + y: int + + +# NOTE: This module uses ``from __future__ import annotations``, which stringizes +# annotations. With the *class* TypedDict syntax, ``typing_extensions`` cannot see +# a ``NotRequired[...]`` wrapper at class-creation time (the annotation is a string), +# so the key is wrongly recorded as required. This is a known limitation of +# stringized annotations + ``NotRequired`` -- NOT a bug in ``json_parse``. The +# functional ``TypedDict(...)`` form keeps ``NotRequired`` as a real runtime object, +# so ``__optional_keys__`` is populated correctly; we use it here to genuinely +# exercise NotRequired. (``total=False``, used by ``PartialConfig`` below, is +# class-level and works correctly even with stringized annotations.) +Config = TypedDict("Config", {"name": str, "count": NotRequired[int]}) + + +class PartialConfig(TypedDict, total=False): + a: int + b: str + + +class Outer(TypedDict): + label: str + point: Point + + +def test_typeddict_valid() -> None: + result = parse_json({"x": 1, "y": 2}, Point) + assert result == {"x": 1, "y": 2} + assert isinstance(result, dict) + + +def test_typeddict_missing_required_key_raises() -> None: + with pytest.raises(ValueError, match="Expected required key"): + parse_json({"x": 1}, Point) + + +def test_typeddict_wrong_value_type_raises() -> None: + with pytest.raises(ValueError, match="Expected int"): + parse_json({"x": 1, "y": "two"}, Point) + + +def test_typeddict_non_mapping_raises() -> None: + with pytest.raises(TypeError, match="Expected a mapping"): + parse_json([1, 2], Point) + + +def test_typeddict_notrequired_present() -> None: + result = parse_json({"name": "a", "count": 3}, Config) + assert result == {"name": "a", "count": 3} + + +def test_typeddict_notrequired_absent() -> None: + result = parse_json({"name": "a"}, Config) + assert result == {"name": "a"} + + +def test_typeddict_notrequired_wrong_type_raises() -> None: + with pytest.raises(ValueError, match="Expected int"): + parse_json({"name": "a", "count": "x"}, Config) + + +def test_typeddict_total_false_all_optional() -> None: + # All keys optional: empty dict is valid. + assert parse_json({}, PartialConfig) == {} + assert parse_json({"a": 1}, PartialConfig) == {"a": 1} + assert parse_json({"a": 1, "b": "y"}, PartialConfig) == {"a": 1, "b": "y"} + + +def test_typeddict_total_false_validates_present_keys() -> None: + with pytest.raises(ValueError, match="Expected int"): + parse_json({"a": "not int"}, PartialConfig) + + +def test_typeddict_nested_valid() -> None: + result = parse_json({"label": "L", "point": {"x": 1, "y": 2}}, Outer) + assert result == {"label": "L", "point": {"x": 1, "y": 2}} + + +def test_typeddict_nested_invalid_recurses() -> None: + with pytest.raises(ValueError, match="Expected int"): + parse_json({"label": "L", "point": {"x": 1, "y": "bad"}}, Outer) + + +def test_typeddict_nested_missing_required_recurses() -> None: + with pytest.raises(ValueError, match="Expected required key"): + parse_json({"label": "L", "point": {"x": 1}}, Outer) + + +def test_typeddict_preserves_extra_keys() -> None: + """Extra keys not declared on the TypedDict are preserved unchanged.""" + result = parse_json({"x": 1, "y": 2, "extra": "kept"}, Point) + assert result == {"x": 1, "y": 2, "extra": "kept"} + + +# --------------------------------------------------------------------------- +# Fallback / error quality +# --------------------------------------------------------------------------- + + +def test_unsupported_annotation_raises_typeerror() -> None: + with pytest.raises(TypeError, match="unsupported type annotation"): + parse_json(1, complex) + + +def test_unsupported_annotation_names_value_and_annotation() -> None: + with pytest.raises(TypeError) as excinfo: + parse_json(object(), set[int]) + msg = str(excinfo.value) + assert "set" in msg + + +def test_error_message_names_offending_value() -> None: + """Primitive errors name the offending value via ``repr``.""" + with pytest.raises(ValueError, match=r"Expected int, got 'nope' instead"): + parse_json("nope", int) + + +def test_error_message_names_expected_literal_choices() -> None: + with pytest.raises(ValueError, match=r"Expected one of \('C', 'F'\)"): + parse_json("Q", Literal["C", "F"]) From 0eba162e1e5c8377b10bac10509f8f464aaffc18 Mon Sep 17 00:00:00 2001 From: Yong-Shin Jiang Date: Sun, 14 Jun 2026 22:54:10 -0700 Subject: [PATCH 2/8] Migrate parse_order, parse_bool, parse_zarr_format to parse_json (#3285) Pilot migration delegating three representative helpers to the unified parse_json validator. Public signatures and return types are unchanged. parse_zarr_format re-wraps parse_json's ValueError/TypeError as MetadataValidationError with the original message to preserve observable behavior. parse_json is imported function-locally to avoid circular imports. Focused suites green: test_common/test_config/test_metadata/test_json_parse = 430 passed. Co-Authored-By: Claude Opus 4.8 (1M context) --- src/zarr/core/common.py | 12 ++++++------ src/zarr/core/metadata/v3.py | 11 +++++++---- 2 files changed, 13 insertions(+), 10 deletions(-) diff --git a/src/zarr/core/common.py b/src/zarr/core/common.py index eafffa1818..b1b4403f1c 100644 --- a/src/zarr/core/common.py +++ b/src/zarr/core/common.py @@ -206,15 +206,15 @@ def parse_fill_value(data: Any) -> Any: def parse_order(data: Any) -> Literal["C", "F"]: - if data in ("C", "F"): - return cast("Literal['C', 'F']", data) - raise ValueError(f"Expected one of ('C', 'F'), got {data} instead.") + from zarr.core.json_parse import parse_json + + return cast("Literal['C', 'F']", parse_json(data, Literal["C", "F"])) def parse_bool(data: Any) -> bool: - if isinstance(data, bool): - return data - raise ValueError(f"Expected bool, got {data} instead.") + from zarr.core.json_parse import parse_json + + return cast("bool", parse_json(data, bool)) def _warn_write_empty_chunks_kwarg() -> None: diff --git a/src/zarr/core/metadata/v3.py b/src/zarr/core/metadata/v3.py index 9eaccc5076..b27ff05160 100644 --- a/src/zarr/core/metadata/v3.py +++ b/src/zarr/core/metadata/v3.py @@ -47,10 +47,13 @@ def parse_zarr_format(data: object) -> Literal[3]: - if data == 3: - return 3 - msg = f"Invalid value for 'zarr_format'. Expected '3'. Got '{data}'." - raise MetadataValidationError(msg) + from zarr.core.json_parse import parse_json + + try: + return cast("Literal[3]", parse_json(data, Literal[3])) + except (ValueError, TypeError) as exc: + msg = f"Invalid value for 'zarr_format'. Expected '3'. Got '{data}'." + raise MetadataValidationError(msg) from exc def parse_node_type_array(data: object) -> Literal["array"]: From df1a0cc14818f13d5c075e3bb035a552911d65a6 Mon Sep 17 00:00:00 2001 From: Yong-Shin Jiang Date: Sun, 14 Jun 2026 23:23:50 -0700 Subject: [PATCH 3/8] Fix lint and make TypedDict NotRequired robust under future annotations (#3285) Apply ruff check/format to satisfy the Lint CI hook. Keep typing.Union/Optional spellings in tests (with noqa) to cover that origin path alongside X | Y. Rework _parse_typeddict to derive required/optional from get_type_hints(include_extras=True) + __total__ instead of __required_keys__, so class-syntax NotRequired is detected even when 'from __future__ import annotations' stringizes the hints (as in zarr's metadata modules). test_json_parse + test_common + test_metadata = 393 passed. Co-Authored-By: Claude Opus 4.8 (1M context) --- src/zarr/core/json_parse.py | 50 +++++++++++++++++++++++++------------ tests/test_json_parse.py | 39 ++++++++++++++++------------- 2 files changed, 55 insertions(+), 34 deletions(-) diff --git a/src/zarr/core/json_parse.py b/src/zarr/core/json_parse.py index 9e22121bca..8608e69d11 100644 --- a/src/zarr/core/json_parse.py +++ b/src/zarr/core/json_parse.py @@ -24,7 +24,7 @@ import types from collections.abc import Mapping, Sequence -from typing import Any, Literal, Union, get_args, get_origin +from typing import Any, Literal, NotRequired, Required, Union, get_args, get_origin from typing_extensions import get_type_hints, is_typeddict @@ -101,8 +101,7 @@ def parse_json(value: object, type_annotation: object) -> Any: # 9. Fallback raise TypeError( - f"Cannot parse value {value!r} against unsupported type annotation " - f"{type_annotation!r}." + f"Cannot parse value {value!r} against unsupported type annotation {type_annotation!r}." ) @@ -154,7 +153,7 @@ def _parse_union(value: object, type_annotation: object) -> Any: for member in members: try: return parse_json(value, member) - except (ValueError, TypeError) as exc: # noqa: PERF203 + except (ValueError, TypeError) as exc: errors.append(f" - against {member!r}: {exc}") joined = "\n".join(errors) raise ValueError( @@ -193,7 +192,9 @@ def _parse_tuple(value: object, type_annotation: object) -> tuple[Any, ...]: f"Expected a sequence of length {len(args)}, got {value!r} of " f"length {len(value)} instead." ) - return tuple(parse_json(item, element_type) for item, element_type in zip(value, args, strict=True)) + return tuple( + parse_json(item, element_type) for item, element_type in zip(value, args, strict=True) + ) def _parse_sequence(value: object, type_annotation: object) -> tuple[Any, ...]: @@ -239,19 +240,36 @@ def _parse_mapping(value: object, type_annotation: object) -> dict[str, Any]: def _parse_typeddict(value: object, type_annotation: Any) -> dict[str, Any]: """Validate a :class:`~typing.TypedDict` annotation. - Each required key must be present and is parsed against its annotation. - Optional keys are parsed when present. Totality is respected via the - TypedDict's ``__required_keys__`` / ``__optional_keys__`` attributes. + Each required key must be present and is parsed against its annotation; + optional keys are parsed when present. Whether a key is required is derived + from the *resolved* annotations -- their ``Required`` / ``NotRequired`` + wrappers combined with the TypedDict's totality (``__total__``). + + This deliberately does not use ``__required_keys__`` / ``__optional_keys__``: + under ``from __future__ import annotations`` those are computed from + stringized annotations at class-creation time, so a ``NotRequired[...]`` + wrapper is invisible to them. Resolving the hints with + ``include_extras=True`` evaluates the strings and makes the wrappers visible. """ if not isinstance(value, Mapping): raise TypeError(f"Expected a mapping, got {value!r} instead.") - # ``include_extras=True`` keeps ``Required`` / ``NotRequired`` / ``ReadOnly`` - # wrappers visible if needed; the required/optional split is taken from the - # TypedDict's own bookkeeping which already accounts for totality. - hints = get_type_hints(type_annotation, include_extras=False) - required_keys: frozenset[str] = getattr(type_annotation, "__required_keys__", frozenset()) - optional_keys: frozenset[str] = getattr(type_annotation, "__optional_keys__", frozenset()) + hints = get_type_hints(type_annotation, include_extras=True) + total = getattr(type_annotation, "__total__", True) + + required_keys: set[str] = set() + field_types: dict[str, Any] = {} + for key, hint in hints.items(): + origin = get_origin(hint) + if origin is Required: + required_keys.add(key) + field_types[key] = get_args(hint)[0] + elif origin is NotRequired: + field_types[key] = get_args(hint)[0] + else: + if total: + required_keys.add(key) + field_types[key] = hint missing = [key for key in required_keys if key not in value] if missing: @@ -261,9 +279,9 @@ def _parse_typeddict(value: object, type_annotation: Any) -> dict[str, Any]: ) result: dict[str, Any] = {} - for key in required_keys | optional_keys: + for key, field_type in field_types.items(): if key in value: - result[key] = parse_json(value[key], hints[key]) + result[key] = parse_json(value[key], field_type) # Preserve any extra keys not declared on the TypedDict. for key, val in value.items(): if key not in result: diff --git a/tests/test_json_parse.py b/tests/test_json_parse.py index 87f9f587c4..cf87dad914 100644 --- a/tests/test_json_parse.py +++ b/tests/test_json_parse.py @@ -10,13 +10,13 @@ from __future__ import annotations from collections.abc import Mapping, Sequence -from typing import TYPE_CHECKING, Literal, Optional, Union +from typing import TYPE_CHECKING, Literal, NotRequired, Optional, Union import pytest # ``parse_json`` (the module under test) uses ``typing_extensions`` for its # TypedDict bookkeeping, so we build TypedDicts the same way for consistency. -from typing_extensions import NotRequired, TypedDict +from typing_extensions import TypedDict from zarr.core.json_parse import parse_json @@ -159,7 +159,7 @@ def test_literal_one_does_not_match_bool_literal() -> None: def test_literal_mixed_members() -> None: - annotation = Literal["a", 3, None] + annotation = Literal["a", 3] | None assert parse_json("a", annotation) == "a" assert parse_json(3, annotation) == 3 # ``None`` is a valid literal member. @@ -169,11 +169,15 @@ def test_literal_mixed_members() -> None: # --------------------------------------------------------------------------- # Union / Optional # --------------------------------------------------------------------------- +# These tests deliberately use the ``typing.Union`` / ``typing.Optional`` +# spelling (which yields a ``typing.Union`` origin) to cover that code path; +# the ``X | Y`` spelling (``types.UnionType``) is covered separately below. +# The legacy-spelling lines therefore suppress ruff's UP007/UP045 rules. @pytest.mark.parametrize("value", ["hello", 5]) def test_union_each_member_accepted(value: Any) -> None: - assert parse_json(value, Union[str, int]) == value + assert parse_json(value, Union[str, int]) == value # noqa: UP007 def test_union_pep604_syntax() -> None: @@ -184,27 +188,27 @@ def test_union_pep604_syntax() -> None: def test_union_no_member_matches_raises() -> None: with pytest.raises(ValueError, match="Expected a value matching one of"): - parse_json(1.5, Union[str, int]) + parse_json(1.5, Union[str, int]) # noqa: UP007 def test_union_error_lists_each_member() -> None: with pytest.raises(ValueError, match="Tried each union member"): - parse_json([], Union[str, int]) + parse_json([], Union[str, int]) # noqa: UP007 @pytest.mark.parametrize("value", [None, "x"]) def test_optional_accepts_none_and_value(value: Any) -> None: - assert parse_json(value, Optional[str]) == value + assert parse_json(value, Optional[str]) == value # noqa: UP045 def test_optional_rejects_other_types() -> None: with pytest.raises(ValueError, match="Expected a value matching one of"): - parse_json(5, Optional[str]) + parse_json(5, Optional[str]) # noqa: UP045 def test_union_with_coercion_returns_coerced_value() -> None: """A union member that coerces (Sequence -> tuple) returns the coerced form.""" - result = parse_json([1, 2, 3], Union[str, Sequence[int]]) + result = parse_json([1, 2, 3], Union[str, Sequence[int]]) # noqa: UP007 assert result == (1, 2, 3) assert isinstance(result, tuple) @@ -358,15 +362,14 @@ class Point(TypedDict): # NOTE: This module uses ``from __future__ import annotations``, which stringizes -# annotations. With the *class* TypedDict syntax, ``typing_extensions`` cannot see -# a ``NotRequired[...]`` wrapper at class-creation time (the annotation is a string), -# so the key is wrongly recorded as required. This is a known limitation of -# stringized annotations + ``NotRequired`` -- NOT a bug in ``json_parse``. The -# functional ``TypedDict(...)`` form keeps ``NotRequired`` as a real runtime object, -# so ``__optional_keys__`` is populated correctly; we use it here to genuinely -# exercise NotRequired. (``total=False``, used by ``PartialConfig`` below, is -# class-level and works correctly even with stringized annotations.) -Config = TypedDict("Config", {"name": str, "count": NotRequired[int]}) +# annotations -- so a class-syntax ``NotRequired[...]`` is invisible to the +# TypedDict's ``__optional_keys__``. ``_parse_typeddict`` handles this by +# resolving the hints with ``get_type_hints(..., include_extras=True)`` and +# reading the ``Required`` / ``NotRequired`` wrappers directly, so the class +# form below genuinely exercises NotRequired under stringized annotations. +class Config(TypedDict): + name: str + count: NotRequired[int] class PartialConfig(TypedDict, total=False): From 78874b3b6057476f64ac808a5d06f064f53c3285 Mon Sep 17 00:00:00 2001 From: Yong-Shin Jiang Date: Sun, 14 Jun 2026 23:32:49 -0700 Subject: [PATCH 4/8] Annotate origin as Any to satisfy mypy in _parse_typeddict (#3285) get_origin(hint) is Required/NotRequired tripped mypy's comparison-overlap and unreachable checks; typing origin as Any keeps the runtime check while satisfying mypy. Full 'uv run --frozen mypy' is clean (190 files); ruff check/format clean. Co-Authored-By: Claude Opus 4.8 (1M context) --- src/zarr/core/json_parse.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/src/zarr/core/json_parse.py b/src/zarr/core/json_parse.py index 8608e69d11..0ce63173c4 100644 --- a/src/zarr/core/json_parse.py +++ b/src/zarr/core/json_parse.py @@ -260,7 +260,9 @@ def _parse_typeddict(value: object, type_annotation: Any) -> dict[str, Any]: required_keys: set[str] = set() field_types: dict[str, Any] = {} for key, hint in hints.items(): - origin = get_origin(hint) + # Annotated as ``Any`` so mypy does not narrow the ``is`` comparisons + # against the ``Required`` / ``NotRequired`` special forms. + origin: Any = get_origin(hint) if origin is Required: required_keys.add(key) field_types[key] = get_args(hint)[0] From 730274a29e8d7325eaf5cd87247cfcbcf56d0275 Mon Sep 17 00:00:00 2001 From: Yong-Shin Jiang Date: Sun, 21 Jun 2026 17:14:54 -0700 Subject: [PATCH 5/8] Migrate Batch 1 literal parsers to parse_json (#3285) Delegate the literal/type check of parse_indexing_order, parse_node_type, parse_node_type_array, parse_separator, two parse_zarr_format variants (group, v2), and parse_name to the unified parse_json. Public signatures and return types unchanged; each preserves its original exception type and message (wrapping parse_json's ValueError/TypeError into MetadataValidationError / NodeTypeValidationError / the original ValueError/TypeError where tests assert on them). parse_json imported function-locally to avoid circular imports. Focused suites + full mypy green (1196 passed). Co-Authored-By: Claude Opus 4.8 (1M context) --- src/zarr/core/chunk_key_encodings.py | 9 ++++++--- src/zarr/core/common.py | 15 +++++++++------ src/zarr/core/config.py | 7 +++---- src/zarr/core/group.py | 22 ++++++++++++++-------- src/zarr/core/metadata/v2.py | 11 ++++++++--- src/zarr/core/metadata/v3.py | 11 +++++++---- 6 files changed, 47 insertions(+), 28 deletions(-) diff --git a/src/zarr/core/chunk_key_encodings.py b/src/zarr/core/chunk_key_encodings.py index 098f2c8981..d4fb560255 100644 --- a/src/zarr/core/chunk_key_encodings.py +++ b/src/zarr/core/chunk_key_encodings.py @@ -19,9 +19,12 @@ def parse_separator(data: JSON) -> SeparatorLiteral: - if data not in (".", "/"): - raise ValueError(f"Expected an '.' or '/' separator. Got {data} instead.") - return cast("SeparatorLiteral", data) + from zarr.core.json_parse import parse_json + + try: + return cast("SeparatorLiteral", parse_json(data, Literal[".", "/"])) + except (ValueError, TypeError) as exc: + raise ValueError(f"Expected an '.' or '/' separator. Got {data} instead.") from exc class ChunkKeyEncodingParams(TypedDict): diff --git a/src/zarr/core/common.py b/src/zarr/core/common.py index b1b4403f1c..7b346008a0 100644 --- a/src/zarr/core/common.py +++ b/src/zarr/core/common.py @@ -126,12 +126,15 @@ def parse_enum[E: Enum](data: object, cls: type[E]) -> E: def parse_name(data: JSON, expected: str | None = None) -> str: - if isinstance(data, str): - if expected is None or data == expected: - return data - raise ValueError(f"Expected '{expected}'. Got {data} instead.") - else: - raise TypeError(f"Expected a string, got an instance of {type(data)}.") + from zarr.core.json_parse import parse_json + + try: + data = cast("str", parse_json(data, str)) + except (ValueError, TypeError) as exc: + raise TypeError(f"Expected a string, got an instance of {type(data)}.") from exc + if expected is None or data == expected: + return data + raise ValueError(f"Expected '{expected}'. Got {data} instead.") def parse_configuration(data: JSON) -> JSON: diff --git a/src/zarr/core/config.py b/src/zarr/core/config.py index 7dcbc78e31..cd6490dc60 100644 --- a/src/zarr/core/config.py +++ b/src/zarr/core/config.py @@ -149,7 +149,6 @@ def enable_gpu(self) -> ConfigSet: def parse_indexing_order(data: Any) -> Literal["C", "F"]: - if data in ("C", "F"): - return cast("Literal['C', 'F']", data) - msg = f"Expected one of ('C', 'F'), got {data} instead." - raise ValueError(msg) + from zarr.core.json_parse import parse_json + + return cast("Literal['C', 'F']", parse_json(data, Literal["C", "F"])) diff --git a/src/zarr/core/group.py b/src/zarr/core/group.py index 52eaa3e144..43df2b70fe 100644 --- a/src/zarr/core/group.py +++ b/src/zarr/core/group.py @@ -85,18 +85,24 @@ def parse_zarr_format(data: Any) -> ZarrFormat: """Parse the zarr_format field from metadata.""" - if data in (2, 3): - return cast("ZarrFormat", data) - msg = f"Invalid zarr_format. Expected one of 2 or 3. Got {data}." - raise ValueError(msg) + from zarr.core.json_parse import parse_json + + try: + return cast("ZarrFormat", parse_json(data, Literal[2, 3])) + except (ValueError, TypeError) as exc: + msg = f"Invalid zarr_format. Expected one of 2 or 3. Got {data}." + raise ValueError(msg) from exc def parse_node_type(data: Any) -> NodeType: """Parse the node_type field from metadata.""" - if data in ("array", "group"): - return cast("Literal['array', 'group']", data) - msg = f"Invalid value for 'node_type'. Expected 'array' or 'group'. Got '{data}'." - raise MetadataValidationError(msg) + from zarr.core.json_parse import parse_json + + try: + return cast("Literal['array', 'group']", parse_json(data, Literal["array", "group"])) + except (ValueError, TypeError) as exc: + msg = f"Invalid value for 'node_type'. Expected 'array' or 'group'. Got '{data}'." + raise MetadataValidationError(msg) from exc # todo: convert None to empty dict diff --git a/src/zarr/core/metadata/v2.py b/src/zarr/core/metadata/v2.py index ac32521239..1704c29a02 100644 --- a/src/zarr/core/metadata/v2.py +++ b/src/zarr/core/metadata/v2.py @@ -278,9 +278,14 @@ def parse_dtype(data: npt.DTypeLike) -> np.dtype[Any]: def parse_zarr_format(data: object) -> Literal[2]: - if data == 2: - return 2 - raise ValueError(f"Invalid value. Expected 2. Got {data}.") + from typing import Literal + + from zarr.core.json_parse import parse_json + + try: + return cast("Literal[2]", parse_json(data, Literal[2])) + except (ValueError, TypeError) as exc: + raise ValueError(f"Invalid value. Expected 2. Got {data}.") from exc def parse_filters(data: object) -> tuple[Numcodec, ...] | None: diff --git a/src/zarr/core/metadata/v3.py b/src/zarr/core/metadata/v3.py index b27ff05160..31f365d46a 100644 --- a/src/zarr/core/metadata/v3.py +++ b/src/zarr/core/metadata/v3.py @@ -57,10 +57,13 @@ def parse_zarr_format(data: object) -> Literal[3]: def parse_node_type_array(data: object) -> Literal["array"]: - if data == "array": - return "array" - msg = f"Invalid value for 'node_type'. Expected 'array'. Got '{data}'." - raise NodeTypeValidationError(msg) + from zarr.core.json_parse import parse_json + + try: + return cast('Literal["array"]', parse_json(data, Literal["array"])) + except (ValueError, TypeError) as exc: + msg = f"Invalid value for 'node_type'. Expected 'array'. Got '{data}'." + raise NodeTypeValidationError(msg) from exc def parse_codecs(data: object) -> tuple[Codec, ...]: From 6bde3f4700fcac7c1ece9c42e934a8a86ab58cab Mon Sep 17 00:00:00 2001 From: Yong-Shin Jiang Date: Sun, 21 Jun 2026 17:22:01 -0700 Subject: [PATCH 6/8] Migrate Batch 2 codec primitive parsers to parse_json (#3285) Delegate the int/bool type check of parse_checksum, parse_clevel, parse_blocksize, parse_typesize, parse_gzip_level, and parse_zstd_level to parse_json, keeping each helper's range/bound check and exact error messages. Original exception types are preserved by wrapping parse_json's ValueError/TypeError. Note: parse_json rejects bool where the old isinstance(data, int) accepted it; verified no caller/test passes a bool to these, so this is a deliberate, more-correct strictening. parse_json imported function-locally. Codec suite + full mypy green (794 passed). Co-Authored-By: Claude Opus 4.8 (1M context) --- src/zarr/codecs/blosc.py | 40 ++++++++++++++++++++++++++-------------- src/zarr/codecs/gzip.py | 14 +++++++++----- src/zarr/codecs/zstd.py | 24 ++++++++++++++++-------- 3 files changed, 51 insertions(+), 27 deletions(-) diff --git a/src/zarr/codecs/blosc.py b/src/zarr/codecs/blosc.py index 087de716fc..2bd00b4931 100644 --- a/src/zarr/codecs/blosc.py +++ b/src/zarr/codecs/blosc.py @@ -104,27 +104,39 @@ class BloscCname(metaclass=_DeprecatedStrEnumMeta): def parse_typesize(data: JSON) -> int: - if isinstance(data, int): - if data > 0: - return data - else: - raise ValueError( - f"Value must be greater than 0. Got {data}, which is less or equal to 0." - ) - raise TypeError(f"Value must be an int. Got {type(data)} instead.") + from zarr.core.json_parse import parse_json + + try: + parsed: int = parse_json(data, int) + except (ValueError, TypeError) as exc: + raise TypeError(f"Value must be an int. Got {type(data)} instead.") from exc + if parsed > 0: + return parsed + else: + raise ValueError( + f"Value must be greater than 0. Got {parsed}, which is less or equal to 0." + ) # todo: real validation def parse_clevel(data: JSON) -> int: - if isinstance(data, int): - return data - raise TypeError(f"Value should be an int. Got {type(data)} instead.") + from zarr.core.json_parse import parse_json + + try: + parsed: int = parse_json(data, int) + except (ValueError, TypeError) as exc: + raise TypeError(f"Value should be an int. Got {type(data)} instead.") from exc + return parsed def parse_blocksize(data: JSON) -> int: - if isinstance(data, int): - return data - raise TypeError(f"Value should be an int. Got {type(data)} instead.") + from zarr.core.json_parse import parse_json + + try: + parsed: int = parse_json(data, int) + except (ValueError, TypeError) as exc: + raise TypeError(f"Value should be an int. Got {type(data)} instead.") from exc + return parsed def _parse_cname(data: object) -> BloscCnameLiteral: diff --git a/src/zarr/codecs/gzip.py b/src/zarr/codecs/gzip.py index b8591748f7..60a2f7ef34 100644 --- a/src/zarr/codecs/gzip.py +++ b/src/zarr/codecs/gzip.py @@ -19,13 +19,17 @@ def parse_gzip_level(data: JSON) -> int: - if not isinstance(data, (int)): - raise TypeError(f"Expected int, got {type(data)}") - if data not in range(10): + from zarr.core.json_parse import parse_json + + try: + parsed: int = parse_json(data, int) + except (ValueError, TypeError) as exc: + raise TypeError(f"Expected int, got {type(data)}") from exc + if parsed not in range(10): raise ValueError( - f"Expected an integer from the inclusive range (0, 9). Got {data} instead." + f"Expected an integer from the inclusive range (0, 9). Got {parsed} instead." ) - return data + return parsed @dataclass(frozen=True) diff --git a/src/zarr/codecs/zstd.py b/src/zarr/codecs/zstd.py index f93c25a3c7..38aca9d5d9 100644 --- a/src/zarr/codecs/zstd.py +++ b/src/zarr/codecs/zstd.py @@ -21,17 +21,25 @@ def parse_zstd_level(data: JSON) -> int: - if isinstance(data, int): - if data >= 23: - raise ValueError(f"Value must be less than or equal to 22. Got {data} instead.") - return data - raise TypeError(f"Got value with type {type(data)}, but expected an int.") + from zarr.core.json_parse import parse_json + + try: + parsed: int = parse_json(data, int) + except (ValueError, TypeError) as exc: + raise TypeError(f"Got value with type {type(data)}, but expected an int.") from exc + if parsed >= 23: + raise ValueError(f"Value must be less than or equal to 22. Got {parsed} instead.") + return parsed def parse_checksum(data: JSON) -> bool: - if isinstance(data, bool): - return data - raise TypeError(f"Expected bool. Got {type(data)}.") + from zarr.core.json_parse import parse_json + + try: + parsed: bool = parse_json(data, bool) + except (ValueError, TypeError) as exc: + raise TypeError(f"Expected bool. Got {type(data)}.") from exc + return parsed @dataclass(frozen=True) From e29d0a0570d2bdb0aae87a3868bd323ae198c801 Mon Sep 17 00:00:00 2001 From: Yong-Shin Jiang Date: Mon, 29 Jun 2026 00:15:43 -0700 Subject: [PATCH 7/8] Replace hand-written parse_json with msgspec.convert (#3285) Delete the bespoke parse_json runtime type checker and route JSON metadata validation through msgspec.convert, which handles the type coercions zarr needs (Literal membership, int/bool strictness, list-to-tuple). A small hand-written fallback (validate_json_value) covers the recursive JSON values msgspec cannot build a schema for, and adds an explicit nesting-depth limit. The registry/dtype/numcodec parsers stay hand-written since they need runtime lookups. Also fixes a latent generator-exhaustion bug in parse_storage_transformers, adds msgspec as a dependency (pinned in the min_deps env), and preserves the exception types and messages every migrated helper raised. Net ~555 lines removed. Tests, mypy and ruff all green. Co-Authored-By: Claude Opus 4.8 (1M context) --- changes/3285.feature.md | 7 +- pyproject.toml | 2 + src/zarr/codecs/blosc.py | 12 +- src/zarr/codecs/gzip.py | 4 +- src/zarr/codecs/zstd.py | 8 +- src/zarr/core/chunk_key_encodings.py | 4 +- src/zarr/core/common.py | 18 +- src/zarr/core/config.py | 7 +- src/zarr/core/group.py | 8 +- src/zarr/core/json_parse.py | 323 +++------------ src/zarr/core/metadata/v2.py | 4 +- src/zarr/core/metadata/v3.py | 23 +- tests/test_common.py | 20 + tests/test_json_parse.py | 567 +++++---------------------- 14 files changed, 226 insertions(+), 781 deletions(-) diff --git a/changes/3285.feature.md b/changes/3285.feature.md index 498f1d38e7..82232f8610 100644 --- a/changes/3285.feature.md +++ b/changes/3285.feature.md @@ -1 +1,6 @@ -Add `zarr.core.json_parse.parse_json`, a unified runtime type checker that validates JSON-decoded metadata values against a type annotation and returns data assignable to it (coercing sequences to tuples), or raises a clear error. It handles the JSON-relevant type categories used by array metadata: primitives (`None`, `str`, `int`, `float`, `bool`), `Literal`, unions/`Optional`, fixed and variadic `tuple`, `Sequence`/`list`, `Mapping`/`dict`, and `TypedDict`. This begins de-duplicating the scattered per-field `parse_*` helpers into a single shared validation path (see #3285). +JSON metadata validation now delegates to ``msgspec.convert`` for the type +coercions it supports (``Literal`` membership, ``int`` / ``bool`` strictness, +list-to-tuple), replacing the per-field hand-written ``parse_*`` logic. A small +fallback validates the recursive JSON values msgspec cannot, now with an +explicit nesting-depth limit, and a latent generator-exhaustion bug in +``parse_storage_transformers`` is fixed. See #3285. diff --git a/pyproject.toml b/pyproject.toml index 02e66c67e8..6cfb7d52cf 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -37,6 +37,7 @@ dependencies = [ 'google-crc32c>=1.5', 'typing_extensions>=4.14', 'donfig>=0.8', + 'msgspec>=0.19', ] dynamic = [ @@ -271,6 +272,7 @@ extra-dependencies = [ 'typing_extensions==4.14.*', 'donfig==0.8.*', 'obstore==0.5.*', + 'msgspec==0.19.*', ] [tool.hatch.envs.default] diff --git a/src/zarr/codecs/blosc.py b/src/zarr/codecs/blosc.py index 2bd00b4931..2a891087f0 100644 --- a/src/zarr/codecs/blosc.py +++ b/src/zarr/codecs/blosc.py @@ -104,10 +104,10 @@ class BloscCname(metaclass=_DeprecatedStrEnumMeta): def parse_typesize(data: JSON) -> int: - from zarr.core.json_parse import parse_json + from zarr.core.json_parse import convert try: - parsed: int = parse_json(data, int) + parsed: int = convert(data, int) except (ValueError, TypeError) as exc: raise TypeError(f"Value must be an int. Got {type(data)} instead.") from exc if parsed > 0: @@ -120,20 +120,20 @@ def parse_typesize(data: JSON) -> int: # todo: real validation def parse_clevel(data: JSON) -> int: - from zarr.core.json_parse import parse_json + from zarr.core.json_parse import convert try: - parsed: int = parse_json(data, int) + parsed: int = convert(data, int) except (ValueError, TypeError) as exc: raise TypeError(f"Value should be an int. Got {type(data)} instead.") from exc return parsed def parse_blocksize(data: JSON) -> int: - from zarr.core.json_parse import parse_json + from zarr.core.json_parse import convert try: - parsed: int = parse_json(data, int) + parsed: int = convert(data, int) except (ValueError, TypeError) as exc: raise TypeError(f"Value should be an int. Got {type(data)} instead.") from exc return parsed diff --git a/src/zarr/codecs/gzip.py b/src/zarr/codecs/gzip.py index 60a2f7ef34..8d0e9ca9ed 100644 --- a/src/zarr/codecs/gzip.py +++ b/src/zarr/codecs/gzip.py @@ -19,10 +19,10 @@ def parse_gzip_level(data: JSON) -> int: - from zarr.core.json_parse import parse_json + from zarr.core.json_parse import convert try: - parsed: int = parse_json(data, int) + parsed: int = convert(data, int) except (ValueError, TypeError) as exc: raise TypeError(f"Expected int, got {type(data)}") from exc if parsed not in range(10): diff --git a/src/zarr/codecs/zstd.py b/src/zarr/codecs/zstd.py index 38aca9d5d9..824e277915 100644 --- a/src/zarr/codecs/zstd.py +++ b/src/zarr/codecs/zstd.py @@ -21,10 +21,10 @@ def parse_zstd_level(data: JSON) -> int: - from zarr.core.json_parse import parse_json + from zarr.core.json_parse import convert try: - parsed: int = parse_json(data, int) + parsed: int = convert(data, int) except (ValueError, TypeError) as exc: raise TypeError(f"Got value with type {type(data)}, but expected an int.") from exc if parsed >= 23: @@ -33,10 +33,10 @@ def parse_zstd_level(data: JSON) -> int: def parse_checksum(data: JSON) -> bool: - from zarr.core.json_parse import parse_json + from zarr.core.json_parse import convert try: - parsed: bool = parse_json(data, bool) + parsed: bool = convert(data, bool) except (ValueError, TypeError) as exc: raise TypeError(f"Expected bool. Got {type(data)}.") from exc return parsed diff --git a/src/zarr/core/chunk_key_encodings.py b/src/zarr/core/chunk_key_encodings.py index d4fb560255..20a2fe988d 100644 --- a/src/zarr/core/chunk_key_encodings.py +++ b/src/zarr/core/chunk_key_encodings.py @@ -19,10 +19,10 @@ def parse_separator(data: JSON) -> SeparatorLiteral: - from zarr.core.json_parse import parse_json + from zarr.core.json_parse import convert try: - return cast("SeparatorLiteral", parse_json(data, Literal[".", "/"])) + return cast("SeparatorLiteral", convert(data, Literal[".", "/"])) except (ValueError, TypeError) as exc: raise ValueError(f"Expected an '.' or '/' separator. Got {data} instead.") from exc diff --git a/src/zarr/core/common.py b/src/zarr/core/common.py index 71e991ee67..6e0096c42a 100644 --- a/src/zarr/core/common.py +++ b/src/zarr/core/common.py @@ -124,10 +124,10 @@ def parse_enum[E: Enum](data: object, cls: type[E]) -> E: def parse_name(data: JSON, expected: str | None = None) -> str: - from zarr.core.json_parse import parse_json + from zarr.core.json_parse import convert try: - data = cast("str", parse_json(data, str)) + data = cast("str", convert(data, str)) except (ValueError, TypeError) as exc: raise TypeError(f"Expected a string, got an instance of {type(data)}.") from exc if expected is None or data == expected: @@ -207,15 +207,21 @@ def parse_fill_value(data: Any) -> Any: def parse_order(data: Any) -> Literal["C", "F"]: - from zarr.core.json_parse import parse_json + from zarr.core.json_parse import convert - return cast("Literal['C', 'F']", parse_json(data, Literal["C", "F"])) + try: + return cast("Literal['C', 'F']", convert(data, Literal["C", "F"])) + except TypeError as exc: + raise ValueError(f"Expected one of ('C', 'F'), got {data!r} instead.") from exc def parse_bool(data: Any) -> bool: - from zarr.core.json_parse import parse_json + from zarr.core.json_parse import convert - return cast("bool", parse_json(data, bool)) + try: + return cast("bool", convert(data, bool)) + except TypeError as exc: + raise ValueError(f"Expected bool, got {data!r} instead.") from exc def parse_int(data: Any) -> int: diff --git a/src/zarr/core/config.py b/src/zarr/core/config.py index 98a7f12321..0c818795b5 100644 --- a/src/zarr/core/config.py +++ b/src/zarr/core/config.py @@ -151,6 +151,9 @@ def enable_gpu(self) -> ConfigSet: def parse_indexing_order(data: Any) -> Literal["C", "F"]: - from zarr.core.json_parse import parse_json + from zarr.core.json_parse import convert - return cast("Literal['C', 'F']", parse_json(data, Literal["C", "F"])) + try: + return cast("Literal['C', 'F']", convert(data, Literal["C", "F"])) + except TypeError as exc: + raise ValueError(f"Expected one of ('C', 'F'), got {data!r} instead.") from exc diff --git a/src/zarr/core/group.py b/src/zarr/core/group.py index 43df2b70fe..3fce939ae6 100644 --- a/src/zarr/core/group.py +++ b/src/zarr/core/group.py @@ -85,10 +85,10 @@ def parse_zarr_format(data: Any) -> ZarrFormat: """Parse the zarr_format field from metadata.""" - from zarr.core.json_parse import parse_json + from zarr.core.json_parse import convert try: - return cast("ZarrFormat", parse_json(data, Literal[2, 3])) + return cast("ZarrFormat", convert(data, Literal[2, 3])) except (ValueError, TypeError) as exc: msg = f"Invalid zarr_format. Expected one of 2 or 3. Got {data}." raise ValueError(msg) from exc @@ -96,10 +96,10 @@ def parse_zarr_format(data: Any) -> ZarrFormat: def parse_node_type(data: Any) -> NodeType: """Parse the node_type field from metadata.""" - from zarr.core.json_parse import parse_json + from zarr.core.json_parse import convert try: - return cast("Literal['array', 'group']", parse_json(data, Literal["array", "group"])) + return cast("Literal['array', 'group']", convert(data, Literal["array", "group"])) except (ValueError, TypeError) as exc: msg = f"Invalid value for 'node_type'. Expected 'array' or 'group'. Got '{data}'." raise MetadataValidationError(msg) from exc diff --git a/src/zarr/core/json_parse.py b/src/zarr/core/json_parse.py index 0ce63173c4..9218425cb5 100644 --- a/src/zarr/core/json_parse.py +++ b/src/zarr/core/json_parse.py @@ -1,291 +1,72 @@ -"""Unified runtime type checking for JSON-decoded metadata. +"""Helpers for validating JSON-decoded metadata. -This module provides a single entry point, :func:`parse_json`, that validates a -JSON-decoded ``value`` against a Python ``type_annotation`` and returns data -assignable to that annotation (coercing where sensible, e.g. a list is coerced -to a ``tuple`` for ``Sequence[T]`` annotations), or raises a useful exception. +Most JSON metadata validation is delegated to :func:`msgspec.convert`, which +handles the type coercions Zarr needs (``Literal`` membership, ``int``/``bool`` +strictness, list-to-tuple, ``TypedDict`` with ``NotRequired``). :func:`convert` +is a thin wrapper that translates :class:`msgspec.ValidationError` into the +``TypeError`` the rest of the codebase already raises. -It is intended to consolidate the many hand-written ``parse_*`` helpers spread -across the codebase (see issue #3285). The scope is deliberately limited to the -JSON-shaped types that appear in Zarr metadata: +msgspec cannot handle two things in Zarr's metadata types: -* primitives: ``None``, ``str``, ``int``, ``float``, ``bool`` -* ``Literal[...]`` -* unions, including ``Optional[T]`` and ``X | Y`` -* ``tuple[...]`` (fixed-length and variadic) -* ``Sequence[T]`` / ``list[T]`` (coerced to ``tuple``) -* ``Mapping[str, T]`` / ``dict[str, T]`` -* :class:`~typing.TypedDict` +* the recursive ``JSON`` / ``JSONValue`` aliases, which it rejects at + schema-build time, and +* PEP 728 ``extra_items=`` extension fields, which it silently drops. -Anything outside this scope raises a :class:`TypeError`. +:func:`validate_json_value` is the small hand-written fallback for the first of +those. See https://github.com/zarr-developers/zarr-python/issues/3285. """ from __future__ import annotations -import types -from collections.abc import Mapping, Sequence -from typing import Any, Literal, NotRequired, Required, Union, get_args, get_origin +from collections.abc import Mapping +from typing import TYPE_CHECKING, Any, Final, cast -from typing_extensions import get_type_hints, is_typeddict +import msgspec -__all__ = ["parse_json"] +if TYPE_CHECKING: + from zarr.core.common import JSON -# Primitive types handled by a plain ``isinstance`` check. ``bool`` deliberately -# comes before ``int`` because ``bool`` is a subclass of ``int`` and the two -# must not be confused (see ``_parse_primitive``). -_PRIMITIVES: tuple[type, ...] = (bool, int, float, str) +__all__ = ["MAX_JSON_DEPTH", "convert", "validate_json_value"] +MAX_JSON_DEPTH: Final = 64 +"""Maximum nesting depth accepted by :func:`validate_json_value`.""" -def parse_json(value: object, type_annotation: object) -> Any: - """Validate ``value`` against ``type_annotation`` and return assignable data. - Parameters - ---------- - value : object - A JSON-decoded value (``str``, ``int``, ``float``, ``bool``, ``None``, - ``list``/``Sequence`` or ``dict``/``Mapping``). - type_annotation : object - The expected type. One of the categories listed in the module - docstring. +def convert(value: object, type_: Any, *, strict: bool = True) -> Any: + """Validate and coerce ``value`` against ``type_`` via :func:`msgspec.convert`. - Returns - ------- - object - ``value``, possibly coerced (e.g. a list coerced to a ``tuple`` for a - ``Sequence[T]`` annotation, or a TypedDict-shaped ``dict``). - - Raises - ------ - ValueError - If ``value`` does not satisfy a primitive, literal, or union - annotation. - TypeError - If ``value`` has the wrong container shape, or if - ``type_annotation`` is not within the supported scope. - """ - # 1. None / type(None) - if type_annotation is None or type_annotation is type(None): - if value is None: - return None - raise ValueError(f"Expected None, got {value!r} instead.") - - origin = get_origin(type_annotation) - - # 3. Literal[...] - if origin is Literal: - return _parse_literal(value, type_annotation) - - # 4. Union / Optional / types.UnionType (X | Y) - if origin is Union or origin is types.UnionType: - return _parse_union(value, type_annotation) - - # 8. TypedDict - if is_typeddict(type_annotation): - return _parse_typeddict(value, type_annotation) - - # 5. tuple[...] - if origin is tuple: - return _parse_tuple(value, type_annotation) - - # 7. Mapping[str, T] / dict[str, T] - if origin is not None and isinstance(origin, type) and issubclass(origin, Mapping): - return _parse_mapping(value, type_annotation) - - # 6. Sequence[T] / list[T] (after tuple/Mapping so those take precedence) - if origin is not None and isinstance(origin, type) and issubclass(origin, Sequence): - return _parse_sequence(value, type_annotation) - - # 2. Primitives (str, int, float, bool) - if isinstance(type_annotation, type) and issubclass(type_annotation, _PRIMITIVES): - return _parse_primitive(value, type_annotation) - - # 9. Fallback - raise TypeError( - f"Cannot parse value {value!r} against unsupported type annotation {type_annotation!r}." - ) - - -def _parse_primitive(value: object, type_annotation: type) -> Any: - """Validate ``value`` against a primitive type via ``isinstance``. - - The critical edge case is that ``bool`` is a subclass of ``int``. When an - ``int`` is expected a ``bool`` must be rejected, and when a ``bool`` is - expected an ``int`` must be rejected. - """ - if type_annotation is bool: - if isinstance(value, bool): - return value - raise ValueError(f"Expected bool, got {value!r} instead.") - - if type_annotation is int: - # Reject bool, which is an ``int`` subclass. - if isinstance(value, int) and not isinstance(value, bool): - return value - raise ValueError(f"Expected int, got {value!r} instead.") - - if type_annotation is float: - # Reject bool, which is an ``int`` (and thus a ``float``-compatible) value. - if isinstance(value, float) and not isinstance(value, bool): - return value - raise ValueError(f"Expected float, got {value!r} instead.") - - # str - if isinstance(value, type_annotation): - return value - raise ValueError(f"Expected {type_annotation.__name__}, got {value!r} instead.") - - -def _parse_literal(value: object, type_annotation: object) -> Any: - """Validate that ``value`` is one of the literal members.""" - choices = get_args(type_annotation) - # ``True == 1`` in Python, so guard against a bool being accepted for an int - # literal (and vice versa) by also comparing types for the matched member. - for choice in choices: - if value == choice and type(value) is type(choice): - return value - raise ValueError(f"Expected one of {choices!r}, got {value!r} instead.") - - -def _parse_union(value: object, type_annotation: object) -> Any: - """Try each union member; return the first that parses, else aggregate.""" - members = get_args(type_annotation) - errors: list[str] = [] - for member in members: - try: - return parse_json(value, member) - except (ValueError, TypeError) as exc: - errors.append(f" - against {member!r}: {exc}") - joined = "\n".join(errors) - raise ValueError( - f"Expected a value matching one of {members!r}, got {value!r} instead. " - f"Tried each union member:\n{joined}" - ) - - -def _parse_tuple(value: object, type_annotation: object) -> tuple[Any, ...]: - """Validate a fixed-length or variadic ``tuple[...]`` annotation. - - ``tuple[int, str]`` is fixed-length; ``tuple[int, ...]`` is variadic. Each - element is recursively parsed against its corresponding element type. - """ - if not isinstance(value, Sequence) or isinstance(value, str | bytes): - raise TypeError(f"Expected a sequence, got {value!r} instead.") - - args = get_args(type_annotation) - - # Bare ``tuple`` with no parameters: accept any sequence of elements. - if not args: - return tuple(value) - - # Variadic ``tuple[T, ...]``. - if len(args) == 2 and args[1] is Ellipsis: - element_type = args[0] - return tuple(parse_json(item, element_type) for item in value) - - # Special-case the empty tuple ``tuple[()]``. - if args == ((),): - args = () - - # Fixed-length ``tuple[T1, T2, ...]``. - if len(value) != len(args): - raise TypeError( - f"Expected a sequence of length {len(args)}, got {value!r} of " - f"length {len(value)} instead." - ) - return tuple( - parse_json(item, element_type) for item, element_type in zip(value, args, strict=True) - ) - - -def _parse_sequence(value: object, type_annotation: object) -> tuple[Any, ...]: - """Validate ``Sequence[T]`` / ``list[T]``, returning a ``tuple``. - - Each element is recursively parsed against ``T``. A ``str`` is *not* a valid - sequence here, even though it is technically a ``Sequence[str]``, because in - JSON terms a string is a primitive, not an array. + msgspec handles the JSON-shaped coercions Zarr needs but raises + :class:`msgspec.ValidationError` on a mismatch. We translate that to + ``TypeError`` so callers can keep raising the exception types the rest of + Zarr already expects. """ - if not isinstance(value, Sequence) or isinstance(value, str | bytes): - raise TypeError(f"Expected a sequence, got {value!r} instead.") - - args = get_args(type_annotation) - if not args: - # Bare ``list`` / ``Sequence`` with no element type: accept any element. - return tuple(value) - element_type = args[0] - return tuple(parse_json(item, element_type) for item in value) + try: + return msgspec.convert(value, type_, strict=strict) + except msgspec.ValidationError as exc: + raise TypeError(str(exc)) from exc -def _parse_mapping(value: object, type_annotation: object) -> dict[str, Any]: - """Validate ``Mapping[str, T]`` / ``dict[str, T]``, returning a ``dict``. +def validate_json_value(value: object, *, max_depth: int = MAX_JSON_DEPTH, _depth: int = 0) -> JSON: + """Check that ``value`` is a JSON value and return it unchanged. - Keys must be ``str`` and each value is recursively parsed against ``T``. + msgspec cannot build a schema for Zarr's recursive ``JSON`` / ``JSONValue`` + aliases, so this covers the fields typed that way (``attributes``, + ``fill_value``, extension-field values). Unlike the previous per-field + parsers it also enforces ``max_depth``: a pathologically nested document + could otherwise exhaust the interpreter stack. """ - if not isinstance(value, Mapping): - raise TypeError(f"Expected a mapping, got {value!r} instead.") - - args = get_args(type_annotation) - if args: - key_type, value_type = args[0], args[1] - else: - key_type, value_type = str, object - - result: dict[str, Any] = {} - for key, val in value.items(): - if key_type is str and not isinstance(key, str): - raise TypeError(f"Expected mapping key to be str, got {key!r} instead.") - result[key] = parse_json(val, value_type) if value_type is not object else val - return result - - -def _parse_typeddict(value: object, type_annotation: Any) -> dict[str, Any]: - """Validate a :class:`~typing.TypedDict` annotation. - - Each required key must be present and is parsed against its annotation; - optional keys are parsed when present. Whether a key is required is derived - from the *resolved* annotations -- their ``Required`` / ``NotRequired`` - wrappers combined with the TypedDict's totality (``__total__``). - - This deliberately does not use ``__required_keys__`` / ``__optional_keys__``: - under ``from __future__ import annotations`` those are computed from - stringized annotations at class-creation time, so a ``NotRequired[...]`` - wrapper is invisible to them. Resolving the hints with - ``include_extras=True`` evaluates the strings and makes the wrappers visible. - """ - if not isinstance(value, Mapping): - raise TypeError(f"Expected a mapping, got {value!r} instead.") - - hints = get_type_hints(type_annotation, include_extras=True) - total = getattr(type_annotation, "__total__", True) - - required_keys: set[str] = set() - field_types: dict[str, Any] = {} - for key, hint in hints.items(): - # Annotated as ``Any`` so mypy does not narrow the ``is`` comparisons - # against the ``Required`` / ``NotRequired`` special forms. - origin: Any = get_origin(hint) - if origin is Required: - required_keys.add(key) - field_types[key] = get_args(hint)[0] - elif origin is NotRequired: - field_types[key] = get_args(hint)[0] - else: - if total: - required_keys.add(key) - field_types[key] = hint - - missing = [key for key in required_keys if key not in value] - if missing: - raise ValueError( - f"Expected required key(s) {sorted(missing)!r} for " - f"{type_annotation.__name__}, got {value!r} instead." - ) - - result: dict[str, Any] = {} - for key, field_type in field_types.items(): - if key in value: - result[key] = parse_json(value[key], field_type) - # Preserve any extra keys not declared on the TypedDict. - for key, val in value.items(): - if key not in result: - result[key] = val - return result + if _depth > max_depth: + raise ValueError(f"JSON value nesting exceeds the maximum depth of {max_depth}.") + if value is None or isinstance(value, (bool, int, float, str)): + return cast("JSON", value) + if isinstance(value, (list, tuple)): + for item in value: + validate_json_value(item, max_depth=max_depth, _depth=_depth + 1) + return cast("JSON", value) + if isinstance(value, Mapping): + for key, item in value.items(): + if not isinstance(key, str): + raise TypeError(f"JSON object keys must be str, got {type(key).__name__}.") + validate_json_value(item, max_depth=max_depth, _depth=_depth + 1) + return cast("JSON", value) + raise TypeError(f"Value {value!r} is not a valid JSON value.") diff --git a/src/zarr/core/metadata/v2.py b/src/zarr/core/metadata/v2.py index 1704c29a02..1250643a27 100644 --- a/src/zarr/core/metadata/v2.py +++ b/src/zarr/core/metadata/v2.py @@ -280,10 +280,10 @@ def parse_dtype(data: npt.DTypeLike) -> np.dtype[Any]: def parse_zarr_format(data: object) -> Literal[2]: from typing import Literal - from zarr.core.json_parse import parse_json + from zarr.core.json_parse import convert try: - return cast("Literal[2]", parse_json(data, Literal[2])) + return cast("Literal[2]", convert(data, Literal[2])) except (ValueError, TypeError) as exc: raise ValueError(f"Invalid value. Expected 2. Got {data}.") from exc diff --git a/src/zarr/core/metadata/v3.py b/src/zarr/core/metadata/v3.py index 31f365d46a..97d77c1cc7 100644 --- a/src/zarr/core/metadata/v3.py +++ b/src/zarr/core/metadata/v3.py @@ -47,20 +47,20 @@ def parse_zarr_format(data: object) -> Literal[3]: - from zarr.core.json_parse import parse_json + from zarr.core.json_parse import convert try: - return cast("Literal[3]", parse_json(data, Literal[3])) + return cast("Literal[3]", convert(data, Literal[3])) except (ValueError, TypeError) as exc: msg = f"Invalid value for 'zarr_format'. Expected '3'. Got '{data}'." raise MetadataValidationError(msg) from exc def parse_node_type_array(data: object) -> Literal["array"]: - from zarr.core.json_parse import parse_json + from zarr.core.json_parse import convert try: - return cast('Literal["array"]', parse_json(data, Literal["array"])) + return cast('Literal["array"]', convert(data, Literal["array"])) except (ValueError, TypeError) as exc: msg = f"Invalid value for 'node_type'. Expected 'array'. Got '{data}'." raise NodeTypeValidationError(msg) from exc @@ -136,11 +136,12 @@ def parse_storage_transformers(data: object) -> tuple[dict[str, JSON], ...]: """ if data is None: return () - if isinstance(data, Iterable): - if len(tuple(data)) >= 1: - return data # type: ignore[return-value] - else: - return () + if isinstance(data, Iterable) and not isinstance(data, (str, bytes)): + # Materialise once. The previous implementation called ``len(tuple(data))`` + # and then returned ``data`` itself, which exhausted (and discarded) a + # one-shot iterable and could return a value typed as a tuple that was not + # actually a tuple. + return tuple(data) raise TypeError( f"Invalid storage_transformers. Expected an iterable of dicts. Got {type(data)} instead." ) @@ -616,6 +617,8 @@ def to_buffer_dict(self, prototype: BufferPrototype) -> dict[str, Buffer]: @classmethod def from_dict(cls, data: dict[str, JSON]) -> Self: + from zarr.core.json_parse import validate_json_value + # make a copy because we are modifying the dict _data = data.copy() @@ -662,7 +665,7 @@ def from_dict(cls, data: dict[str, JSON]) -> Self: chunk_grid=_data_typed["chunk_grid"], # type: ignore[arg-type] chunk_key_encoding=_data_typed["chunk_key_encoding"], # type: ignore[arg-type] codecs=_data_typed["codecs"], - attributes=_data_typed.get("attributes", {}), # type: ignore[arg-type] + attributes=validate_json_value(_data_typed.get("attributes", {})), # type: ignore[arg-type] dimension_names=_data_typed.get("dimension_names", None), fill_value=fill_value_parsed, data_type=data_type, diff --git a/tests/test_common.py b/tests/test_common.py index 2fe0743e14..caac012aa0 100644 --- a/tests/test_common.py +++ b/tests/test_common.py @@ -9,8 +9,10 @@ from zarr.core.common import ( ANY_ACCESS_MODE, AccessModeLiteral, + parse_bool, parse_int, parse_name, + parse_order, parse_shapelike, product, ) @@ -73,6 +75,24 @@ def test_parse_indexing_order_invalid(data: Any) -> None: parse_indexing_order(data) +@pytest.mark.parametrize("data", [0, 1, "hello", "f"]) +def test_parse_order_invalid(data: Any) -> None: + with pytest.raises(ValueError, match="Expected one of"): + parse_order(data) + + +@pytest.mark.parametrize("data", [0, 1, "true", None, [True]]) +def test_parse_bool_invalid(data: Any) -> None: + """Non-bool values are rejected with a ValueError (preserving prior behavior).""" + with pytest.raises(ValueError, match="Expected bool"): + parse_bool(data) + + +@pytest.mark.parametrize("data", [True, False]) +def test_parse_bool_valid(data: bool) -> None: + assert parse_bool(data) is data + + @pytest.mark.parametrize("data", ["1", 1.0, True, False, None, [1], (1,)]) def test_parse_int_invalid(data: Any) -> None: """Non-int values (including bools, which are int subclasses) are rejected.""" diff --git a/tests/test_json_parse.py b/tests/test_json_parse.py index cf87dad914..a905378eeb 100644 --- a/tests/test_json_parse.py +++ b/tests/test_json_parse.py @@ -1,479 +1,104 @@ -"""Tests for :func:`zarr.core.json_parse.parse_json`. +"""Tests for :mod:`zarr.core.json_parse`. -These cover every dispatch category described in section 4.3 of the SDD: -primitives (incl. the bool/int edge case), ``Literal``, unions / ``Optional``, -fixed-length and variadic ``tuple``, ``Sequence`` / ``list`` (with coercion to -``tuple``), ``Mapping`` / ``dict``, ``TypedDict`` (required/optional/nested), -and the fallback ``TypeError`` for unsupported annotations. +``convert`` delegates JSON type coercion to :func:`msgspec.convert` (translating +``msgspec.ValidationError`` into ``TypeError``); ``validate_json_value`` is the +hand-written fallback for the recursive ``JSON`` alias msgspec cannot build, +including a nesting-depth limit. The final group is a regression test for the +``parse_storage_transformers`` fix that motivated the depth limit work. """ from __future__ import annotations -from collections.abc import Mapping, Sequence -from typing import TYPE_CHECKING, Literal, NotRequired, Optional, Union +from typing import Literal import pytest -# ``parse_json`` (the module under test) uses ``typing_extensions`` for its -# TypedDict bookkeeping, so we build TypedDicts the same way for consistency. -from typing_extensions import TypedDict - -from zarr.core.json_parse import parse_json - -if TYPE_CHECKING: - from typing import Any - - -# --------------------------------------------------------------------------- -# None / type(None) -# --------------------------------------------------------------------------- - - -@pytest.mark.parametrize("annotation", [None, type(None)]) -def test_none_valid(annotation: Any) -> None: - assert parse_json(None, annotation) is None - - -@pytest.mark.parametrize("annotation", [None, type(None)]) -@pytest.mark.parametrize("value", [0, "", False, [], {}]) -def test_none_invalid(annotation: Any, value: Any) -> None: - with pytest.raises(ValueError, match="Expected None"): - parse_json(value, annotation) - - -# --------------------------------------------------------------------------- -# Primitives: str, int, float, bool -# --------------------------------------------------------------------------- - - -@pytest.mark.parametrize( - ("value", "annotation"), - [ - ("hello", str), - ("", str), - (1, int), - (-5, int), - (1.5, float), - (-0.0, float), - (True, bool), - (False, bool), - ], -) -def test_primitive_valid(value: Any, annotation: type) -> None: - result = parse_json(value, annotation) - assert result == value - # The exact object is returned unchanged for primitives. - assert result is value - - -@pytest.mark.parametrize( - ("value", "annotation"), - [ - (1, str), - (1.0, str), - (True, str), - ("1", int), - (1.0, int), - ("x", float), - (1, float), # an int is NOT a float here - ("true", bool), - (0, bool), - ], -) -def test_primitive_wrong_type_raises(value: Any, annotation: type) -> None: - with pytest.raises(ValueError, match="Expected"): - parse_json(value, annotation) - - -# --- The CRITICAL bool/int edge cases ------------------------------------- - - -def test_bool_not_accepted_as_int() -> None: - """``True`` (an ``int`` subclass instance) must NOT satisfy ``int``.""" - with pytest.raises(ValueError, match="Expected int"): - parse_json(True, int) - with pytest.raises(ValueError, match="Expected int"): - parse_json(False, int) - - -def test_int_not_accepted_as_bool() -> None: - """A plain ``int`` must NOT satisfy ``bool``.""" - with pytest.raises(ValueError, match="Expected bool"): - parse_json(1, bool) - with pytest.raises(ValueError, match="Expected bool"): - parse_json(0, bool) - - -def test_bool_accepted_as_bool() -> None: - assert parse_json(True, bool) is True - assert parse_json(False, bool) is False - - -def test_int_accepted_as_int() -> None: - assert parse_json(1, int) == 1 - assert parse_json(0, int) == 0 - - -def test_bool_not_accepted_as_float() -> None: - """``bool`` is an ``int`` subclass and must not satisfy ``float`` either.""" - with pytest.raises(ValueError, match="Expected float"): - parse_json(True, float) - - -# --------------------------------------------------------------------------- -# Literal[...] -# --------------------------------------------------------------------------- - - -@pytest.mark.parametrize("value", ["C", "F"]) -def test_literal_str_member_valid(value: str) -> None: - assert parse_json(value, Literal["C", "F"]) == value - - -@pytest.mark.parametrize("value", ["Q", "c", "", 0]) -def test_literal_str_non_member_raises(value: Any) -> None: - with pytest.raises(ValueError, match="Expected one of"): - parse_json(value, Literal["C", "F"]) - - -@pytest.mark.parametrize("value", [1, 2]) -def test_literal_int_member_valid(value: int) -> None: - assert parse_json(value, Literal[1, 2]) == value - - -def test_literal_int_non_member_raises() -> None: - with pytest.raises(ValueError, match="Expected one of"): - parse_json(3, Literal[1, 2]) - - -def test_literal_true_does_not_match_int_literal() -> None: - """``True == 1`` in Python, but ``True`` must NOT satisfy ``Literal[1, 2]``.""" - with pytest.raises(ValueError, match="Expected one of"): - parse_json(True, Literal[1, 2]) - - -def test_literal_one_does_not_match_bool_literal() -> None: - """Conversely, ``1`` must not satisfy a bool literal.""" - with pytest.raises(ValueError, match="Expected one of"): - parse_json(1, Literal[True, False]) - - -def test_literal_mixed_members() -> None: - annotation = Literal["a", 3] | None - assert parse_json("a", annotation) == "a" - assert parse_json(3, annotation) == 3 - # ``None`` is a valid literal member. - assert parse_json(None, annotation) is None - - -# --------------------------------------------------------------------------- -# Union / Optional -# --------------------------------------------------------------------------- -# These tests deliberately use the ``typing.Union`` / ``typing.Optional`` -# spelling (which yields a ``typing.Union`` origin) to cover that code path; -# the ``X | Y`` spelling (``types.UnionType``) is covered separately below. -# The legacy-spelling lines therefore suppress ruff's UP007/UP045 rules. - - -@pytest.mark.parametrize("value", ["hello", 5]) -def test_union_each_member_accepted(value: Any) -> None: - assert parse_json(value, Union[str, int]) == value # noqa: UP007 - - -def test_union_pep604_syntax() -> None: - """The ``X | Y`` syntax (``types.UnionType``) is handled too.""" - assert parse_json("hello", str | int) == "hello" - assert parse_json(5, str | int) == 5 - - -def test_union_no_member_matches_raises() -> None: - with pytest.raises(ValueError, match="Expected a value matching one of"): - parse_json(1.5, Union[str, int]) # noqa: UP007 - - -def test_union_error_lists_each_member() -> None: - with pytest.raises(ValueError, match="Tried each union member"): - parse_json([], Union[str, int]) # noqa: UP007 - - -@pytest.mark.parametrize("value", [None, "x"]) -def test_optional_accepts_none_and_value(value: Any) -> None: - assert parse_json(value, Optional[str]) == value # noqa: UP045 - - -def test_optional_rejects_other_types() -> None: - with pytest.raises(ValueError, match="Expected a value matching one of"): - parse_json(5, Optional[str]) # noqa: UP045 - - -def test_union_with_coercion_returns_coerced_value() -> None: - """A union member that coerces (Sequence -> tuple) returns the coerced form.""" - result = parse_json([1, 2, 3], Union[str, Sequence[int]]) # noqa: UP007 - assert result == (1, 2, 3) - assert isinstance(result, tuple) - - -# --------------------------------------------------------------------------- -# tuple[...] -# --------------------------------------------------------------------------- - - -def test_tuple_fixed_length_valid() -> None: - result = parse_json([1, "a"], tuple[int, str]) - assert result == (1, "a") - assert isinstance(result, tuple) - - -def test_tuple_fixed_length_wrong_length_raises() -> None: - with pytest.raises(TypeError, match="Expected a sequence of length 2"): - parse_json([1], tuple[int, str]) - with pytest.raises(TypeError, match="Expected a sequence of length 2"): - parse_json([1, "a", "extra"], tuple[int, str]) - - -def test_tuple_fixed_length_wrong_element_type_raises() -> None: - # Second element should be str but is an int -> inner primitive ValueError. - with pytest.raises(ValueError, match="Expected str"): - parse_json([1, 2], tuple[int, str]) - - -def test_tuple_variadic_valid() -> None: - result = parse_json([1, 2, 3], tuple[int, ...]) - assert result == (1, 2, 3) - assert isinstance(result, tuple) - - -def test_tuple_variadic_empty() -> None: - assert parse_json([], tuple[int, ...]) == () - - -def test_tuple_variadic_wrong_element_raises() -> None: - with pytest.raises(ValueError, match="Expected int"): - parse_json([1, "x", 3], tuple[int, ...]) - - -def test_tuple_accepts_tuple_input() -> None: - assert parse_json((1, "a"), tuple[int, str]) == (1, "a") - - -def test_tuple_rejects_non_sequence() -> None: - with pytest.raises(TypeError, match="Expected a sequence"): - parse_json(5, tuple[int, ...]) - - -def test_tuple_rejects_str_input() -> None: - """A ``str`` is not treated as a sequence for tuple parsing.""" - with pytest.raises(TypeError, match="Expected a sequence"): - parse_json("ab", tuple[str, str]) - - -def test_tuple_empty_annotation() -> None: - """``tuple[()]`` matches the empty sequence.""" - assert parse_json([], tuple[()]) == () - - -# --------------------------------------------------------------------------- -# Sequence[T] / list[T] (coerced to tuple) -# --------------------------------------------------------------------------- - - -def test_sequence_coerces_to_tuple() -> None: - result = parse_json([1, 2, 3], Sequence[int]) - assert result == (1, 2, 3) - assert isinstance(result, tuple) - - -def test_list_coerces_to_tuple() -> None: - result = parse_json([1, 2, 3], list[int]) - assert result == (1, 2, 3) - assert isinstance(result, tuple) - - -def test_sequence_validates_elements() -> None: - with pytest.raises(ValueError, match="Expected int"): - parse_json([1, "two", 3], Sequence[int]) - - -def test_sequence_rejects_str() -> None: - """A bare string is a primitive in JSON terms, not a sequence.""" - with pytest.raises(TypeError, match="Expected a sequence"): - parse_json("abc", Sequence[str]) - - -def test_sequence_rejects_non_sequence() -> None: - with pytest.raises(TypeError, match="Expected a sequence"): - parse_json(42, Sequence[int]) - - -def test_sequence_of_sequence_nested() -> None: - result = parse_json([[1, 2], [3]], Sequence[Sequence[int]]) - assert result == ((1, 2), (3,)) - assert isinstance(result, tuple) - assert all(isinstance(inner, tuple) for inner in result) - - -# --------------------------------------------------------------------------- -# Mapping[str, T] / dict[str, T] -# --------------------------------------------------------------------------- - - -def test_mapping_valid_returns_dict() -> None: - result = parse_json({"a": 1, "b": 2}, Mapping[str, int]) - assert result == {"a": 1, "b": 2} - assert isinstance(result, dict) - - -def test_dict_valid_returns_dict() -> None: - result = parse_json({"a": 1}, dict[str, int]) - assert result == {"a": 1} - assert isinstance(result, dict) - - -def test_mapping_wrong_value_type_raises() -> None: - with pytest.raises(ValueError, match="Expected int"): - parse_json({"a": "not an int"}, Mapping[str, int]) - - -def test_mapping_non_str_key_raises() -> None: - with pytest.raises(TypeError, match="Expected mapping key to be str"): - parse_json({1: 1}, Mapping[str, int]) - - -def test_mapping_non_mapping_raises() -> None: - with pytest.raises(TypeError, match="Expected a mapping"): - parse_json([("a", 1)], Mapping[str, int]) - - -def test_mapping_nested_values() -> None: - result = parse_json({"a": [1, 2], "b": [3]}, Mapping[str, Sequence[int]]) - assert result == {"a": (1, 2), "b": (3,)} - assert isinstance(result, dict) - assert all(isinstance(v, tuple) for v in result.values()) - - -# --------------------------------------------------------------------------- -# TypedDict -# --------------------------------------------------------------------------- - - -class Point(TypedDict): - x: int - y: int - - -# NOTE: This module uses ``from __future__ import annotations``, which stringizes -# annotations -- so a class-syntax ``NotRequired[...]`` is invisible to the -# TypedDict's ``__optional_keys__``. ``_parse_typeddict`` handles this by -# resolving the hints with ``get_type_hints(..., include_extras=True)`` and -# reading the ``Required`` / ``NotRequired`` wrappers directly, so the class -# form below genuinely exercises NotRequired under stringized annotations. -class Config(TypedDict): - name: str - count: NotRequired[int] - - -class PartialConfig(TypedDict, total=False): - a: int - b: str - - -class Outer(TypedDict): - label: str - point: Point - - -def test_typeddict_valid() -> None: - result = parse_json({"x": 1, "y": 2}, Point) - assert result == {"x": 1, "y": 2} - assert isinstance(result, dict) - - -def test_typeddict_missing_required_key_raises() -> None: - with pytest.raises(ValueError, match="Expected required key"): - parse_json({"x": 1}, Point) - - -def test_typeddict_wrong_value_type_raises() -> None: - with pytest.raises(ValueError, match="Expected int"): - parse_json({"x": 1, "y": "two"}, Point) - - -def test_typeddict_non_mapping_raises() -> None: - with pytest.raises(TypeError, match="Expected a mapping"): - parse_json([1, 2], Point) - - -def test_typeddict_notrequired_present() -> None: - result = parse_json({"name": "a", "count": 3}, Config) - assert result == {"name": "a", "count": 3} - - -def test_typeddict_notrequired_absent() -> None: - result = parse_json({"name": "a"}, Config) - assert result == {"name": "a"} - - -def test_typeddict_notrequired_wrong_type_raises() -> None: - with pytest.raises(ValueError, match="Expected int"): - parse_json({"name": "a", "count": "x"}, Config) - - -def test_typeddict_total_false_all_optional() -> None: - # All keys optional: empty dict is valid. - assert parse_json({}, PartialConfig) == {} - assert parse_json({"a": 1}, PartialConfig) == {"a": 1} - assert parse_json({"a": 1, "b": "y"}, PartialConfig) == {"a": 1, "b": "y"} - - -def test_typeddict_total_false_validates_present_keys() -> None: - with pytest.raises(ValueError, match="Expected int"): - parse_json({"a": "not int"}, PartialConfig) - - -def test_typeddict_nested_valid() -> None: - result = parse_json({"label": "L", "point": {"x": 1, "y": 2}}, Outer) - assert result == {"label": "L", "point": {"x": 1, "y": 2}} - - -def test_typeddict_nested_invalid_recurses() -> None: - with pytest.raises(ValueError, match="Expected int"): - parse_json({"label": "L", "point": {"x": 1, "y": "bad"}}, Outer) - - -def test_typeddict_nested_missing_required_recurses() -> None: - with pytest.raises(ValueError, match="Expected required key"): - parse_json({"label": "L", "point": {"x": 1}}, Outer) - - -def test_typeddict_preserves_extra_keys() -> None: - """Extra keys not declared on the TypedDict are preserved unchanged.""" - result = parse_json({"x": 1, "y": 2, "extra": "kept"}, Point) - assert result == {"x": 1, "y": 2, "extra": "kept"} - - -# --------------------------------------------------------------------------- -# Fallback / error quality -# --------------------------------------------------------------------------- - - -def test_unsupported_annotation_raises_typeerror() -> None: - with pytest.raises(TypeError, match="unsupported type annotation"): - parse_json(1, complex) - - -def test_unsupported_annotation_names_value_and_annotation() -> None: - with pytest.raises(TypeError) as excinfo: - parse_json(object(), set[int]) - msg = str(excinfo.value) - assert "set" in msg - - -def test_error_message_names_offending_value() -> None: - """Primitive errors name the offending value via ``repr``.""" - with pytest.raises(ValueError, match=r"Expected int, got 'nope' instead"): - parse_json("nope", int) - - -def test_error_message_names_expected_literal_choices() -> None: - with pytest.raises(ValueError, match=r"Expected one of \('C', 'F'\)"): - parse_json("Q", Literal["C", "F"]) +from zarr.core.json_parse import MAX_JSON_DEPTH, convert, validate_json_value +from zarr.core.metadata.v3 import parse_storage_transformers + + +class TestConvert: + def test_literal(self) -> None: + assert convert(3, Literal[3]) == 3 + assert convert("array", Literal["array", "group"]) == "array" + + def test_literal_rejects_non_member(self) -> None: + with pytest.raises(TypeError): + convert(4, Literal[3]) + with pytest.raises(TypeError): + convert("Q", Literal["C", "F"]) + + def test_sequence_coerced_to_tuple(self) -> None: + assert convert([1, 2, 3], tuple[int, ...]) == (1, 2, 3) + assert convert([1, 2], tuple[int, int]) == (1, 2) + + def test_int(self) -> None: + assert convert(5, int) == 5 + + def test_bool_int_strictness(self) -> None: + # bool is an int subclass, but the two must not be interchangeable. + with pytest.raises(TypeError): + convert(True, int) + with pytest.raises(TypeError): + convert(1, bool) + # ... and True must not satisfy Literal[1]. + with pytest.raises(TypeError): + convert(True, Literal[1]) + + +class TestValidateJsonValue: + @pytest.mark.parametrize("value", [None, True, 1, 1.5, "s"]) + def test_primitives(self, value: object) -> None: + assert validate_json_value(value) is value + + def test_nested(self) -> None: + value = {"a": [1, 2.0, "x", True, None], "b": {"c": [{}]}} + assert validate_json_value(value) is value + + def test_rejects_non_str_keys(self) -> None: + with pytest.raises(TypeError, match="keys must be str"): + validate_json_value({1: "x"}) + + def test_rejects_non_json_leaf(self) -> None: + with pytest.raises(TypeError, match="not a valid JSON value"): + validate_json_value(object()) + with pytest.raises(TypeError, match="not a valid JSON value"): + validate_json_value({"a": object()}) + + def test_depth_limit(self) -> None: + def nest(depth: int) -> object: + v: object = "leaf" + for _ in range(depth): + v = {"k": v} + return v + + # At the limit it passes; one level deeper it is rejected. This bound is + # new behavior the previous per-field parsers never had. + assert validate_json_value(nest(MAX_JSON_DEPTH)) is not None + with pytest.raises(ValueError, match="maximum depth"): + validate_json_value(nest(MAX_JSON_DEPTH + 1)) + + +class TestStorageTransformersRegression: + """`parse_storage_transformers` used to call `len(tuple(data))` and then + return `data` itself, exhausting a one-shot iterable and returning a value + typed as a tuple but not actually a tuple.""" + + def test_none(self) -> None: + assert parse_storage_transformers(None) == () + + def test_empty(self) -> None: + assert parse_storage_transformers([]) == () + + def test_list_returns_tuple(self) -> None: + result = parse_storage_transformers([{"a": 1}]) + assert result == ({"a": 1},) + assert isinstance(result, tuple) + + def test_generator_not_exhausted(self) -> None: + result = parse_storage_transformers(iter([{"a": 1}, {"b": 2}])) + assert result == ({"a": 1}, {"b": 2}) + + def test_non_iterable_rejected(self) -> None: + with pytest.raises(TypeError, match="Expected an iterable"): + parse_storage_transformers(5) From 041bdf4a86d3930efc56dbe65366dba30ea32908 Mon Sep 17 00:00:00 2001 From: Yong-Shin Jiang Date: Mon, 29 Jun 2026 00:51:49 -0700 Subject: [PATCH 8/8] Encapsulate convert + field-context error into parse_field (#3285) Address review feedback: factor the repeated convert-then-re-raise pattern out of each per-field parser. convert now raises a field-agnostic ValueError("Expected instance of TYPE, got DATA"); the new parse_field wraps it and re-raises with field context ("Failed to parse input for FIELD") using the caller's chosen exception type, chaining the original error. Every per-field parser collapses to a one-liner. Co-Authored-By: Claude Opus 4.8 (1M context) --- src/zarr/codecs/blosc.py | 21 ++++++------------- src/zarr/codecs/gzip.py | 7 ++----- src/zarr/codecs/zstd.py | 14 ++++--------- src/zarr/core/chunk_key_encodings.py | 7 ++----- src/zarr/core/common.py | 12 +++-------- src/zarr/core/config.py | 7 ++----- src/zarr/core/group.py | 19 +++++++---------- src/zarr/core/json_parse.py | 31 ++++++++++++++++++++++------ src/zarr/core/metadata/v2.py | 7 ++----- src/zarr/core/metadata/v3.py | 21 ++++++++----------- tests/test_common.py | 8 +++---- tests/test_json_parse.py | 30 +++++++++++++++++++++------ tests/test_metadata/test_v2.py | 2 +- 13 files changed, 91 insertions(+), 95 deletions(-) diff --git a/src/zarr/codecs/blosc.py b/src/zarr/codecs/blosc.py index 2a891087f0..d0458a91a0 100644 --- a/src/zarr/codecs/blosc.py +++ b/src/zarr/codecs/blosc.py @@ -104,12 +104,9 @@ class BloscCname(metaclass=_DeprecatedStrEnumMeta): def parse_typesize(data: JSON) -> int: - from zarr.core.json_parse import convert + from zarr.core.json_parse import parse_field - try: - parsed: int = convert(data, int) - except (ValueError, TypeError) as exc: - raise TypeError(f"Value must be an int. Got {type(data)} instead.") from exc + parsed: int = parse_field(data, int, "typesize", error=TypeError) if parsed > 0: return parsed else: @@ -120,22 +117,16 @@ def parse_typesize(data: JSON) -> int: # todo: real validation def parse_clevel(data: JSON) -> int: - from zarr.core.json_parse import convert + from zarr.core.json_parse import parse_field - try: - parsed: int = convert(data, int) - except (ValueError, TypeError) as exc: - raise TypeError(f"Value should be an int. Got {type(data)} instead.") from exc + parsed: int = parse_field(data, int, "clevel", error=TypeError) return parsed def parse_blocksize(data: JSON) -> int: - from zarr.core.json_parse import convert + from zarr.core.json_parse import parse_field - try: - parsed: int = convert(data, int) - except (ValueError, TypeError) as exc: - raise TypeError(f"Value should be an int. Got {type(data)} instead.") from exc + parsed: int = parse_field(data, int, "blocksize", error=TypeError) return parsed diff --git a/src/zarr/codecs/gzip.py b/src/zarr/codecs/gzip.py index 8d0e9ca9ed..052d1e928d 100644 --- a/src/zarr/codecs/gzip.py +++ b/src/zarr/codecs/gzip.py @@ -19,12 +19,9 @@ def parse_gzip_level(data: JSON) -> int: - from zarr.core.json_parse import convert + from zarr.core.json_parse import parse_field - try: - parsed: int = convert(data, int) - except (ValueError, TypeError) as exc: - raise TypeError(f"Expected int, got {type(data)}") from exc + parsed: int = parse_field(data, int, "level", error=TypeError) if parsed not in range(10): raise ValueError( f"Expected an integer from the inclusive range (0, 9). Got {parsed} instead." diff --git a/src/zarr/codecs/zstd.py b/src/zarr/codecs/zstd.py index 824e277915..92c97f20b1 100644 --- a/src/zarr/codecs/zstd.py +++ b/src/zarr/codecs/zstd.py @@ -21,24 +21,18 @@ def parse_zstd_level(data: JSON) -> int: - from zarr.core.json_parse import convert + from zarr.core.json_parse import parse_field - try: - parsed: int = convert(data, int) - except (ValueError, TypeError) as exc: - raise TypeError(f"Got value with type {type(data)}, but expected an int.") from exc + parsed: int = parse_field(data, int, "level", error=TypeError) if parsed >= 23: raise ValueError(f"Value must be less than or equal to 22. Got {parsed} instead.") return parsed def parse_checksum(data: JSON) -> bool: - from zarr.core.json_parse import convert + from zarr.core.json_parse import parse_field - try: - parsed: bool = convert(data, bool) - except (ValueError, TypeError) as exc: - raise TypeError(f"Expected bool. Got {type(data)}.") from exc + parsed: bool = parse_field(data, bool, "checksum", error=TypeError) return parsed diff --git a/src/zarr/core/chunk_key_encodings.py b/src/zarr/core/chunk_key_encodings.py index 20a2fe988d..7055f3930b 100644 --- a/src/zarr/core/chunk_key_encodings.py +++ b/src/zarr/core/chunk_key_encodings.py @@ -19,12 +19,9 @@ def parse_separator(data: JSON) -> SeparatorLiteral: - from zarr.core.json_parse import convert + from zarr.core.json_parse import parse_field - try: - return cast("SeparatorLiteral", convert(data, Literal[".", "/"])) - except (ValueError, TypeError) as exc: - raise ValueError(f"Expected an '.' or '/' separator. Got {data} instead.") from exc + return cast("SeparatorLiteral", parse_field(data, Literal[".", "/"], "separator")) class ChunkKeyEncodingParams(TypedDict): diff --git a/src/zarr/core/common.py b/src/zarr/core/common.py index 6e0096c42a..4281decbf5 100644 --- a/src/zarr/core/common.py +++ b/src/zarr/core/common.py @@ -207,21 +207,15 @@ def parse_fill_value(data: Any) -> Any: def parse_order(data: Any) -> Literal["C", "F"]: - from zarr.core.json_parse import convert + from zarr.core.json_parse import parse_field - try: - return cast("Literal['C', 'F']", convert(data, Literal["C", "F"])) - except TypeError as exc: - raise ValueError(f"Expected one of ('C', 'F'), got {data!r} instead.") from exc + return cast("Literal['C', 'F']", parse_field(data, Literal["C", "F"], "order")) def parse_bool(data: Any) -> bool: from zarr.core.json_parse import convert - try: - return cast("bool", convert(data, bool)) - except TypeError as exc: - raise ValueError(f"Expected bool, got {data!r} instead.") from exc + return cast("bool", convert(data, bool)) def parse_int(data: Any) -> int: diff --git a/src/zarr/core/config.py b/src/zarr/core/config.py index 0c818795b5..9752a58870 100644 --- a/src/zarr/core/config.py +++ b/src/zarr/core/config.py @@ -151,9 +151,6 @@ def enable_gpu(self) -> ConfigSet: def parse_indexing_order(data: Any) -> Literal["C", "F"]: - from zarr.core.json_parse import convert + from zarr.core.json_parse import parse_field - try: - return cast("Literal['C', 'F']", convert(data, Literal["C", "F"])) - except TypeError as exc: - raise ValueError(f"Expected one of ('C', 'F'), got {data!r} instead.") from exc + return cast("Literal['C', 'F']", parse_field(data, Literal["C", "F"], "order")) diff --git a/src/zarr/core/group.py b/src/zarr/core/group.py index 3fce939ae6..4bf26e4b31 100644 --- a/src/zarr/core/group.py +++ b/src/zarr/core/group.py @@ -85,24 +85,19 @@ def parse_zarr_format(data: Any) -> ZarrFormat: """Parse the zarr_format field from metadata.""" - from zarr.core.json_parse import convert + from zarr.core.json_parse import parse_field - try: - return cast("ZarrFormat", convert(data, Literal[2, 3])) - except (ValueError, TypeError) as exc: - msg = f"Invalid zarr_format. Expected one of 2 or 3. Got {data}." - raise ValueError(msg) from exc + return cast("ZarrFormat", parse_field(data, Literal[2, 3], "zarr_format")) def parse_node_type(data: Any) -> NodeType: """Parse the node_type field from metadata.""" - from zarr.core.json_parse import convert + from zarr.core.json_parse import parse_field - try: - return cast("Literal['array', 'group']", convert(data, Literal["array", "group"])) - except (ValueError, TypeError) as exc: - msg = f"Invalid value for 'node_type'. Expected 'array' or 'group'. Got '{data}'." - raise MetadataValidationError(msg) from exc + return cast( + "Literal['array', 'group']", + parse_field(data, Literal["array", "group"], "node_type", error=MetadataValidationError), + ) # todo: convert None to empty dict diff --git a/src/zarr/core/json_parse.py b/src/zarr/core/json_parse.py index 9218425cb5..c07479b146 100644 --- a/src/zarr/core/json_parse.py +++ b/src/zarr/core/json_parse.py @@ -26,24 +26,43 @@ if TYPE_CHECKING: from zarr.core.common import JSON -__all__ = ["MAX_JSON_DEPTH", "convert", "validate_json_value"] +__all__ = ["MAX_JSON_DEPTH", "convert", "parse_field", "validate_json_value"] MAX_JSON_DEPTH: Final = 64 """Maximum nesting depth accepted by :func:`validate_json_value`.""" +def _type_name(type_: Any) -> str: + return getattr(type_, "__name__", None) or str(type_).replace("typing.", "") + + def convert(value: object, type_: Any, *, strict: bool = True) -> Any: """Validate and coerce ``value`` against ``type_`` via :func:`msgspec.convert`. - msgspec handles the JSON-shaped coercions Zarr needs but raises - :class:`msgspec.ValidationError` on a mismatch. We translate that to - ``TypeError`` so callers can keep raising the exception types the rest of - Zarr already expects. + On a mismatch msgspec raises :class:`msgspec.ValidationError`; this re-raises + a plain, field-agnostic ``ValueError`` naming the expected type, so callers + can add their own field context (see :func:`parse_field`). """ try: return msgspec.convert(value, type_, strict=strict) except msgspec.ValidationError as exc: - raise TypeError(str(exc)) from exc + raise ValueError(f"Expected instance of {_type_name(type_)}, got {value!r}.") from exc + + +def parse_field( + data: object, type_: Any, field: str, *, error: type[Exception] = ValueError +) -> Any: + """Validate ``data`` for metadata field ``field`` against ``type_``. + + Wraps :func:`convert` and, on failure, re-raises ``error`` with field + context, chaining the underlying type error. This keeps the + ``convert``-then-re-raise pattern in one place rather than repeating it in + every per-field parser. + """ + try: + return convert(data, type_) + except ValueError as exc: + raise error(f"Failed to parse input for {field!r}.") from exc def validate_json_value(value: object, *, max_depth: int = MAX_JSON_DEPTH, _depth: int = 0) -> JSON: diff --git a/src/zarr/core/metadata/v2.py b/src/zarr/core/metadata/v2.py index 1250643a27..a613842106 100644 --- a/src/zarr/core/metadata/v2.py +++ b/src/zarr/core/metadata/v2.py @@ -280,12 +280,9 @@ def parse_dtype(data: npt.DTypeLike) -> np.dtype[Any]: def parse_zarr_format(data: object) -> Literal[2]: from typing import Literal - from zarr.core.json_parse import convert + from zarr.core.json_parse import parse_field - try: - return cast("Literal[2]", convert(data, Literal[2])) - except (ValueError, TypeError) as exc: - raise ValueError(f"Invalid value. Expected 2. Got {data}.") from exc + return cast("Literal[2]", parse_field(data, Literal[2], "zarr_format")) def parse_filters(data: object) -> tuple[Numcodec, ...] | None: diff --git a/src/zarr/core/metadata/v3.py b/src/zarr/core/metadata/v3.py index 97d77c1cc7..a165a27586 100644 --- a/src/zarr/core/metadata/v3.py +++ b/src/zarr/core/metadata/v3.py @@ -47,23 +47,20 @@ def parse_zarr_format(data: object) -> Literal[3]: - from zarr.core.json_parse import convert + from zarr.core.json_parse import parse_field - try: - return cast("Literal[3]", convert(data, Literal[3])) - except (ValueError, TypeError) as exc: - msg = f"Invalid value for 'zarr_format'. Expected '3'. Got '{data}'." - raise MetadataValidationError(msg) from exc + return cast( + "Literal[3]", parse_field(data, Literal[3], "zarr_format", error=MetadataValidationError) + ) def parse_node_type_array(data: object) -> Literal["array"]: - from zarr.core.json_parse import convert + from zarr.core.json_parse import parse_field - try: - return cast('Literal["array"]', convert(data, Literal["array"])) - except (ValueError, TypeError) as exc: - msg = f"Invalid value for 'node_type'. Expected 'array'. Got '{data}'." - raise NodeTypeValidationError(msg) from exc + return cast( + 'Literal["array"]', + parse_field(data, Literal["array"], "node_type", error=NodeTypeValidationError), + ) def parse_codecs(data: object) -> tuple[Codec, ...]: diff --git a/tests/test_common.py b/tests/test_common.py index caac012aa0..2f1e7c8522 100644 --- a/tests/test_common.py +++ b/tests/test_common.py @@ -71,20 +71,20 @@ def test_parse_name_valid(data: tuple[Any, Any]) -> None: @pytest.mark.parametrize("data", [0, 1, "hello", "f"]) def test_parse_indexing_order_invalid(data: Any) -> None: - with pytest.raises(ValueError, match="Expected one of"): + with pytest.raises(ValueError, match="Failed to parse input for 'order'"): parse_indexing_order(data) @pytest.mark.parametrize("data", [0, 1, "hello", "f"]) def test_parse_order_invalid(data: Any) -> None: - with pytest.raises(ValueError, match="Expected one of"): + with pytest.raises(ValueError, match="Failed to parse input for 'order'"): parse_order(data) @pytest.mark.parametrize("data", [0, 1, "true", None, [True]]) def test_parse_bool_invalid(data: Any) -> None: - """Non-bool values are rejected with a ValueError (preserving prior behavior).""" - with pytest.raises(ValueError, match="Expected bool"): + """Non-bool values are rejected with a ValueError.""" + with pytest.raises(ValueError, match="Expected instance of bool"): parse_bool(data) diff --git a/tests/test_json_parse.py b/tests/test_json_parse.py index a905378eeb..da723119aa 100644 --- a/tests/test_json_parse.py +++ b/tests/test_json_parse.py @@ -13,7 +13,7 @@ import pytest -from zarr.core.json_parse import MAX_JSON_DEPTH, convert, validate_json_value +from zarr.core.json_parse import MAX_JSON_DEPTH, convert, parse_field, validate_json_value from zarr.core.metadata.v3 import parse_storage_transformers @@ -23,9 +23,9 @@ def test_literal(self) -> None: assert convert("array", Literal["array", "group"]) == "array" def test_literal_rejects_non_member(self) -> None: - with pytest.raises(TypeError): + with pytest.raises(ValueError, match="Expected instance of"): convert(4, Literal[3]) - with pytest.raises(TypeError): + with pytest.raises(ValueError, match="Expected instance of"): convert("Q", Literal["C", "F"]) def test_sequence_coerced_to_tuple(self) -> None: @@ -37,15 +37,33 @@ def test_int(self) -> None: def test_bool_int_strictness(self) -> None: # bool is an int subclass, but the two must not be interchangeable. - with pytest.raises(TypeError): + with pytest.raises(ValueError): convert(True, int) - with pytest.raises(TypeError): + with pytest.raises(ValueError): convert(1, bool) # ... and True must not satisfy Literal[1]. - with pytest.raises(TypeError): + with pytest.raises(ValueError): convert(True, Literal[1]) +class TestParseField: + def test_valid_passthrough(self) -> None: + assert parse_field(3, Literal[3], "zarr_format") == 3 + + def test_wraps_with_field_context(self) -> None: + with pytest.raises(ValueError, match="Failed to parse input for 'zarr_format'"): + parse_field(4, Literal[3], "zarr_format") + + def test_custom_error_type_and_chaining(self) -> None: + class MyError(ValueError): + pass + + with pytest.raises(MyError, match="Failed to parse input for 'node_type'") as exc_info: + parse_field(5, Literal["array"], "node_type", error=MyError) + # the generic type error is chained as the cause + assert isinstance(exc_info.value.__cause__, ValueError) + + class TestValidateJsonValue: @pytest.mark.parametrize("value", [None, True, 1, 1.5, "s"]) def test_primitives(self, value: object) -> None: diff --git a/tests/test_metadata/test_v2.py b/tests/test_metadata/test_v2.py index d1a1ca00b4..d0560aacea 100644 --- a/tests/test_metadata/test_v2.py +++ b/tests/test_metadata/test_v2.py @@ -31,7 +31,7 @@ def test_parse_zarr_format_valid() -> None: @pytest.mark.parametrize("data", [None, 1, 3, 4, 5, "3"]) def test_parse_zarr_format_invalid(data: Any) -> None: - with pytest.raises(ValueError, match=f"Invalid value. Expected 2. Got {data}"): + with pytest.raises(ValueError, match="Failed to parse input for 'zarr_format'"): parse_zarr_format(data)