diff --git a/pyproject.toml b/pyproject.toml index 834cab0d..a55a2e4e 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -120,6 +120,7 @@ docs = [ ] dev = [ "codecov", + "hypothesis>=6.100", "mypy", "pandas-stubs", "pooch", @@ -330,6 +331,7 @@ markers = [ "network: marks test requiring internet (select with '--run-network-tests')", "slow: marks test as slow (select with '--run-slow-tests')", "minio: marks test requiring docker and minio (select with '--run-minio-tests')", + "hypothesis_tests: property-based tests using hypothesis (slower; exclude with '-m \"not hypothesis_tests\"')", ] filterwarnings = [ "ignore:Numcodecs codecs are not in the Zarr version 3 specification*:UserWarning:numcodecs" diff --git a/virtualizarr/tests/test_parsers/_fill_value_common.py b/virtualizarr/tests/test_parsers/_fill_value_common.py new file mode 100644 index 00000000..f9687c15 --- /dev/null +++ b/virtualizarr/tests/test_parsers/_fill_value_common.py @@ -0,0 +1,372 @@ +"""Shared infrastructure for parser fill-value equivalence test modules. + +Each parser has its own equivalence test module (`test_hdf_fill_value_equivalence.py`, +`test_zarr_fill_value_equivalence.py`, ...). The format-agnostic pieces live +here so adding parser N+1 means writing only the parser-specific bits: + +- a `_write_` file writer +- an `open_observed` and `open_reference` pair of callables +- format-specific dtype / fill-value strategies +- a `_DatasetSpec`-shaped dataclass with any format-specific extra fields +- the test classes themselves + +The shared infrastructure here covers: the `_UNSET` sentinel, hypothesis +profile registration, the module-level `pytestmark` items, two-layer +equivalence assertion helpers, and a generic numeric-dtype strategy. + +Test philosophy: strategies generate the full parameter range +`docs/custom_parsers.md` says the parser should support. Assertions are +plain `assert_identical` — no `xfail`, no `pytest.raises` for +known-broken combinations. Failures are TODO items. Suite green = +parser stack matches spec. +""" + +from __future__ import annotations + +import os +from typing import Any, Callable + +import numpy as np +import pytest +import xarray as xr +from hypothesis import settings +from hypothesis import strategies as st +from hypothesis.extra import numpy as npst + +# --------------------------------------------------------------------------- +# Sentinel for "this parameter was not set" — `None` is a meaningful value +# in several places (e.g. zarr's `fill_value=None` means "use default"), so +# we need a third state. +# --------------------------------------------------------------------------- + + +class _Unset: + """Sentinel for parameters that should be omitted entirely. + + None is a meaningful value for most fill/encoding knobs, so we need a + third state. Used as the default value on parser-specific + `_DatasetSpec` dataclasses so the test writer can pass `_UNSET` to + mean "don't set this attribute at all". + """ + + _instance = None + + def __new__(cls): + if cls._instance is None: + cls._instance = super().__new__(cls) + return cls._instance + + def __repr__(self): # pragma: no cover + return "_UNSET" + + +_UNSET = _Unset() + + +# --------------------------------------------------------------------------- +# Hypothesis profile registration and the module-level pytestmark items +# that every fill-value equivalence module should set. +# --------------------------------------------------------------------------- + + +def register_hypothesis_profiles() -> None: + """Register `ci` (max_examples=10) and `thorough` (max_examples=50) + profiles and load whichever `VIRTUALIZARR_HYPOTHESIS_PROFILE` names + (default: `thorough`). + + Idempotent — safe to call multiple times. Each fill-value equivalence + test module calls this at import time. `register_profile` overwrites + existing profiles of the same name, so re-calling is harmless. + """ + settings.register_profile("ci", deadline=None, max_examples=10) + settings.register_profile("thorough", deadline=None, max_examples=50) + settings.load_profile(os.environ.get("VIRTUALIZARR_HYPOTHESIS_PROFILE", "thorough")) + + +def base_pytestmark() -> list: + """Return the module-level pytestmark list every fill-value equivalence + module uses: the `hypothesis_tests` marker plus a filter for zarr's + "this dtype has no Zarr V3 spec" warnings. + + Returns a fresh list each call so callers can extend it without + mutating shared state. + """ + return [ + pytest.mark.hypothesis_tests, + # Several test cases deliberately exercise dtypes like S* that + # don't yet have a Zarr V3 spec; zarr-python emits an + # informational `UnstableSpecificationWarning` on every read. We + # want those dtypes covered, so silence the warning module-locally. + pytest.mark.filterwarnings("ignore::zarr.errors.UnstableSpecificationWarning"), + ] + + +# --------------------------------------------------------------------------- +# Two-layer equivalence assertion helpers. +# +# Each parser test module provides `open_observed` and `open_reference` +# callbacks that take (filepath, *, decode_cf) and return an xr.Dataset. +# The helpers below run both opens, clear the legitimately-different +# `encoding` dicts, and assert dataset identity. +# +# Attribution: when one or both opens fail, the helpers produce specific +# error messages so the developer can tell whether the failure is +# virtualizarr-specific or downstream-shared. +# --------------------------------------------------------------------------- + + +OpenCallable = Callable[..., xr.Dataset] + + +class BothEnginesFailedIdenticallyError(AssertionError): + """Both engines raised matching exceptions when opening the test file. + + Still a spec failure (the parser stack doesn't satisfy what + `docs/custom_parsers.md` says should work), but **not a + virtualizarr-specific bug** — the fix likely lives downstream + (xarray's `FillValueCoder`, zarr-python, etc.). The exception + message includes the signature so the developer can attribute the + fix correctly. + """ + + +def _exception_signature(e: BaseException) -> str: + """Stable string signature for comparing exceptions. + + Truncates the message to avoid noise from path differences and + long tracebacks. Two exceptions are considered "matching" when + their signatures are equal. + """ + msg = str(e) + # Trim — many xarray/zarr errors include long type repr lines. + if len(msg) > 200: + msg = msg[:200] + return f"{type(e).__name__}: {msg}" + + +_OPEN_OK = "ok" +_OPEN_FAIL = "fail" + + +def _open_with_capture( + open_fn: OpenCallable, filepath: str, *, decode_cf: bool +) -> tuple[str, Any]: + """Try `open_fn(filepath, decode_cf=decode_cf)`; return either + (`_OPEN_OK`, dataset) or (`_OPEN_FAIL`, exception). + """ + try: + return (_OPEN_OK, open_fn(filepath, decode_cf=decode_cf)) + except BaseException as e: # pragma: no cover — exhaustive catch + return (_OPEN_FAIL, e) + + +def _assert_pair_identical( + filepath: str, + *, + decode_cf: bool, + layer_name: str, + open_observed: OpenCallable, + open_reference: OpenCallable, +) -> None: + """Open both engines at the given decode layer and assert they agree. + + Four outcome shapes, each producing a distinct failure mode: + + - both succeed → standard `assert_identical` comparison. + - both fail with matching exception signatures → + `BothEnginesFailedIdenticallyError` (downstream-shared bug). + - both fail differently → plain `AssertionError` with both sigs. + - one fails, one succeeds → `AssertionError` naming which side + failed. + """ + obs_status, obs = _open_with_capture(open_observed, filepath, decode_cf=decode_cf) + ref_status, ref = _open_with_capture(open_reference, filepath, decode_cf=decode_cf) + + # Case: both failed. + if obs_status == _OPEN_FAIL and ref_status == _OPEN_FAIL: + obs_sig = _exception_signature(obs) + ref_sig = _exception_signature(ref) + if obs_sig == ref_sig: + raise BothEnginesFailedIdenticallyError( + f"[{layer_name}] both engines failed identically — " + f"likely a downstream-shared issue, not a virtualizarr-" + f"specific bug.\n {obs_sig}" + ) from obs + raise AssertionError( + f"[{layer_name}] both engines failed but differently — " + f"unexpected divergence.\n" + f" observed (virtualizarr): {obs_sig}\n" + f" reference: {ref_sig}" + ) from obs + + # Case: only one failed. + if obs_status == _OPEN_FAIL: + ref.close() + raise AssertionError( + f"[{layer_name}] observed (virtualizarr) failed; reference " + f"succeeded — likely a virtualizarr-specific bug.\n" + f" observed: {_exception_signature(obs)}" + ) from obs + if ref_status == _OPEN_FAIL: + obs.close() + raise AssertionError( + f"[{layer_name}] reference failed; observed (virtualizarr) " + f"succeeded — unexpected (virtualizarr accepts what the " + f"reference engine doesn't).\n" + f" reference: {_exception_signature(ref)}" + ) from ref + + # Case: both succeeded — compare. + try: + # Encoding dicts legitimately differ between engines (HDF5 chunk + # / filter encoding vs Zarr codec encoding), so clear them + # before comparing. Attribute and value comparison stays strict. + for v in obs.variables: + obs[v].encoding.clear() + for v in ref.variables: + ref[v].encoding.clear() + xr.testing.assert_identical(obs.load(), ref.load()) + finally: + obs.close() + ref.close() + + +def assert_decoded_data_identical( + filepath: str, + *, + open_observed: OpenCallable, + open_reference: OpenCallable, +) -> None: + """Assert the two engines produce identical datasets *after* xarray's + CF decoding (mask_and_scale, _FillValue masking, scale_factor/add_offset, + _Unsigned reinterpretation). + + User-facing correctness check: does the data look the same when + consumed via either engine? + """ + _assert_pair_identical( + filepath, + decode_cf=True, + layer_name="decoded-data", + open_observed=open_observed, + open_reference=open_reference, + ) + + +def assert_raw_attributes_identical( + filepath: str, + *, + open_observed: OpenCallable, + open_reference: OpenCallable, +) -> None: + """Assert the two engines produce identical raw datasets with CF decoding + *off*. + + Exposes the metadata layer directly: any attribute the parser drops, + adds, or transforms (e.g. `_FillValue` encoding mismatches, lost + `_Unsigned`, missing `missing_value`) shows up here even though the + decoded-data helper would mask it via CF decoding. + """ + _assert_pair_identical( + filepath, + decode_cf=False, + layer_name="raw-attributes", + open_observed=open_observed, + open_reference=open_reference, + ) + + +def assert_equivalent( + filepath: str, + *, + open_observed: OpenCallable, + open_reference: OpenCallable, +) -> None: + """Convenience wrapper: run both raw-attribute and decoded-data assertions.""" + assert_raw_attributes_identical( + filepath, open_observed=open_observed, open_reference=open_reference + ) + assert_decoded_data_identical( + filepath, open_observed=open_observed, open_reference=open_reference + ) + + +# --------------------------------------------------------------------------- +# Generic numeric-dtype strategies shared across parser modules. +# +# Each parser module adds format-specific extensions (HDF adds vlen-string +# and S*, TIFF restricts to GDAL-supported dtypes, etc.). +# --------------------------------------------------------------------------- + + +def base_numeric_dtype_strategy() -> st.SearchStrategy[np.dtype]: + """Sample one of the numeric dtypes every format supports: bool, + signed/unsigned int, float, complex. Excludes string and structured + dtypes — those are too format-specific to share. + """ + return st.sampled_from( + [ + np.dtype("?"), # bool + np.dtype("i1"), + np.dtype("i2"), + np.dtype("i4"), + np.dtype("i8"), + np.dtype("u1"), + np.dtype("u2"), + np.dtype("u4"), + np.dtype("u8"), + np.dtype("f4"), + np.dtype("f8"), + np.dtype("c8"), + np.dtype("c16"), + ] + ) + + +def value_in_dtype_strategy(dtype: np.dtype) -> st.SearchStrategy[Any]: + """Return a strategy producing a value compatible with `dtype`. + + Used for both `dataset_fillvalue` and `_FillValue` draws. Doing the + dtype-aware draw at strategy-build time (not via `assume()`) keeps + shrinking efficient. + + Handles the universally-supported kinds (`b`, `i`, `u`, `f`, `c`, `S`). + Parser modules that exercise additional kinds (e.g. `O` for + h5py.string_dtype()) extend this in their own helper. + """ + if dtype.kind == "b": + return st.booleans() + if dtype.kind in "iu": + info = np.iinfo(dtype) + return st.integers(min_value=int(info.min), max_value=int(info.max)) + if dtype.kind == "f": + return st.floats( + width=64 if dtype.itemsize == 8 else 32, + allow_nan=True, + allow_infinity=False, + ) + if dtype.kind == "c": + component = st.floats( + width=32 if dtype.itemsize == 8 else 64, + allow_nan=False, + allow_infinity=False, + ) + return st.builds(complex, component, component) + if dtype.kind == "S": + return st.binary(min_size=0, max_size=dtype.itemsize) + raise ValueError( + f"No fill-value strategy for dtype {dtype!r}; extend the parser " + f"module's own strategy if you need this dtype." + ) + + +def data_in_dtype_strategy( + dtype: np.dtype, shape: tuple[int, ...] +) -> st.SearchStrategy[np.ndarray]: + """Generate a numpy array of the given dtype and shape for the dtypes + handled by `value_in_dtype_strategy`. + + Parser modules whose strategies introduce additional kinds (e.g. vlen + strings) should branch in their own data-strategy helper. + """ + return npst.arrays(dtype=dtype, shape=shape) diff --git a/virtualizarr/tests/test_parsers/test_hdf/test_hdf_fill_value_equivalence.py b/virtualizarr/tests/test_parsers/test_hdf/test_hdf_fill_value_equivalence.py new file mode 100644 index 00000000..3a88bbf1 --- /dev/null +++ b/virtualizarr/tests/test_parsers/test_hdf/test_hdf_fill_value_equivalence.py @@ -0,0 +1,895 @@ +"""Property-based equivalence tests for the HDF5 parser's fill-value handling. + +Each test writes an HDF5 file via h5py, opens it both via xarray+h5netcdf and +via xarray+virtualizarr (manifest_store_from_hdf_url), and asserts the two +loaded datasets are identical at two layers: + + 1. raw attribute layer (decode_cf=False) — exposes metadata bugs that CF + decoding would otherwise mask (e.g. dropped attributes, wrong fill-value + encoding, lost _Unsigned). + 2. decoded data layer (decode_cf=True) — the user-facing correctness + check (mask_and_scale, _FillValue masking, scale_factor/add_offset, + _Unsigned reinterpretation). + +See docs/custom_parsers.md ("Fill values" and "Packing and scaling" sections) +for the rules these tests exercise. + +The strategies in TestBasicEquivalence / TestPackedEquivalence / +TestUnsignedEquivalence generate the *full* parameter range that +`docs/custom_parsers.md` says the parser should support. Assertions are +plain `_assert_h5netcdf_virtualizarr_identical` (raw-attribute + +decoded-data) — no `xfail`, no `pytest.raises` adapter for cases that +don't work today. + +This is intentional: the suite functions as an executable specification. +Failures are TODO items. Suite green = HDFParser stack matches spec. + +Compound/structured dtypes can't be generated by the uniform-dtype +basic strategy and are covered by curated cases in `TestCompoundDtype`. + +Shared infrastructure (sentinel, hypothesis profile registration, +equivalence assertion helpers, generic numeric-dtype strategy) lives in +`virtualizarr/tests/test_parsers/_fill_value_common.py` so other +parsers can reuse it without copy-paste. + +CI throttling: +- VIRTUALIZARR_HYPOTHESIS_PROFILE=ci -> max_examples=10 (faster) +- VIRTUALIZARR_HYPOTHESIS_PROFILE=thorough -> max_examples=50 (default) +- pytest -m "not hypothesis_tests" -> skip this module entirely +""" + +from __future__ import annotations + +import string +from dataclasses import dataclass +from pathlib import Path +from typing import Any + +import h5py +import numpy as np +import pytest +import xarray as xr +from hypothesis import HealthCheck, example, given, settings +from hypothesis import strategies as st +from hypothesis.extra import numpy as npst + +from virtualizarr.tests import requires_hdf5plugin, requires_imagecodecs +from virtualizarr.tests.test_parsers._fill_value_common import ( + _UNSET, + assert_equivalent, + base_numeric_dtype_strategy, + base_pytestmark, + register_hypothesis_profiles, +) +from virtualizarr.tests.test_parsers._fill_value_common import ( + data_in_dtype_strategy as _base_data_in_dtype_strategy, +) +from virtualizarr.tests.test_parsers._fill_value_common import ( + value_in_dtype_strategy as _base_value_in_dtype_strategy, +) +from virtualizarr.tests.utils import manifest_store_from_hdf_url + +pytestmark = base_pytestmark() +register_hypothesis_profiles() + + +@dataclass +class _DatasetSpec: + """A self-contained description of one HDF5 file to write for testing. + + Every field passed to _write_hdf5 by tests should land in this dataclass + so failures shrink to a single repr-able value. + """ + + dtype: np.dtype + data: np.ndarray + dataset_fillvalue: Any = _UNSET + fill_value_attr: Any = _UNSET + scale_factor: float | None = None + add_offset: float | None = None + unsigned: bool = False + chunks: tuple[int, ...] | None = None + # Free-form attributes written verbatim to the dataset. Used to cover + # CF attributes that xarray passes through without special processing + # (missing_value, valid_range, valid_min, valid_max). The raw-attrs + # invariant catches any drop automatically. + extra_attrs: dict[str, Any] | None = None + + +def _write_hdf5( + filepath: Path | str, + *, + dtype: np.dtype, + data: np.ndarray, + dataset_fillvalue: Any = _UNSET, + fill_value_attr: Any = _UNSET, + scale_factor: float | None = None, + add_offset: float | None = None, + unsigned: bool = False, + chunks: tuple[int, ...] | None = None, + extra_attrs: dict[str, Any] | None = None, +) -> str: + """Write a single-variable HDF5 file with the requested fill-value shape. + + Returns the file:// URL for use with manifest_store_from_hdf_url. + """ + filepath = str(filepath) + + create_kwargs: dict[str, Any] = {"name": "data", "data": data, "dtype": dtype} + if chunks is not None: + create_kwargs["chunks"] = chunks + if dataset_fillvalue is not _UNSET: + create_kwargs["fillvalue"] = dataset_fillvalue + + with h5py.File(filepath, "w") as f: + dset = f.create_dataset(**create_kwargs) + if fill_value_attr is not _UNSET: + dset.attrs["_FillValue"] = fill_value_attr + if scale_factor is not None: + dset.attrs["scale_factor"] = scale_factor + if add_offset is not None: + dset.attrs["add_offset"] = add_offset + if unsigned: + dset.attrs["_Unsigned"] = "true" + if extra_attrs is not None: + for k, v in extra_attrs.items(): + dset.attrs[k] = v + # Attach a real dim scale to each axis so xarray+h5netcdf and the + # HDF parser both produce named "dim_" dims instead of + # phony_dim_0 — keeps equivalence assertions about fill values, + # not dim naming. + for axis, size in enumerate(data.shape): + scale_name = f"dim_{axis}" + dim = f.create_dataset(scale_name, data=np.arange(size)) + dim.make_scale(scale_name) + dset.dims[axis].attach_scale(dim) + + return f"file://{filepath}" + + +def _open_virtualizarr(filepath: str, *, decode_cf: bool) -> xr.Dataset: + """Open `filepath` via virtualizarr's HDFParser (manifest store) + xarray's + zarr backend. Plugged into the shared equivalence helpers as the + `open_observed` callback. + """ + manifest_store = manifest_store_from_hdf_url(f"file://{filepath}") + return xr.open_dataset( + manifest_store, + engine="zarr", + consolidated=False, + zarr_format=3, + decode_cf=decode_cf, + ) + + +def _open_h5netcdf(filepath: str, *, decode_cf: bool) -> xr.Dataset: + """Open `filepath` via xarray+h5netcdf. The reference engine for the HDF + equivalence sweep — what a user expects to see when reading the file + directly without virtualizarr. + """ + return xr.open_dataset(filepath, engine="h5netcdf", decode_cf=decode_cf) + + +def _assert_h5netcdf_virtualizarr_identical(filepath: str) -> None: + """Run both raw-attribute and decoded-data invariants against the HDF + parser. Thin wrapper around the shared helper that binds the engines. + """ + assert_equivalent( + filepath, + open_observed=_open_virtualizarr, + open_reference=_open_h5netcdf, + ) + + +def test_module_loads(): + """Smoke test: confirms the module imports, the marker registers, + and hypothesis profiles loaded without error. + """ + assert settings().max_examples in (10, 50) + + +def test_write_hdf5_basic_roundtrip(tmp_path): + """_write_hdf5 writes data, dataset.fillvalue, _FillValue attr, + and dim scale that h5py can read back.""" + url = _write_hdf5( + tmp_path / "data.nc", + dtype=np.dtype("f8"), + data=np.array([1.0, 2.0, 3.0], dtype="f8"), + dataset_fillvalue=-9999.0, + fill_value_attr=-9999.0, + chunks=(2,), + ) + assert url.startswith("file://") + + with h5py.File(url.removeprefix("file://"), "r") as f: + dset = f["data"] + np.testing.assert_array_equal(dset[:], [1.0, 2.0, 3.0]) + assert dset.fillvalue == -9999.0 + assert dset.attrs["_FillValue"] == -9999.0 + assert dset.chunks == (2,) + assert len(dset.dims[0]) == 1 + assert dset.dims[0][0].name.lstrip("/") == "dim_0" + + +def test_write_hdf5_unset_skips_fillvalue(tmp_path): + """When dataset_fillvalue=_UNSET, h5py uses its dtype default; + when fill_value_attr=_UNSET, no _FillValue attribute is written. + """ + url = _write_hdf5( + tmp_path / "data.nc", + dtype=np.dtype("i4"), + data=np.array([1, 2, 3], dtype="i4"), + # both fill axes left UNSET + ) + with h5py.File(url.removeprefix("file://"), "r") as f: + assert "_FillValue" not in f["data"].attrs + + +def test_equivalence_helper_passes_on_simple_file(tmp_path): + """A trivial float64 file with no fill values must round-trip + identically through both paths.""" + url = _write_hdf5( + tmp_path / "simple.nc", + dtype=np.dtype("f8"), + data=np.array([1.0, 2.0, 3.0, 4.0], dtype="f8"), + ) + _assert_h5netcdf_virtualizarr_identical(url.removeprefix("file://")) + + +# A small enum-label map reused for all enum-dtype samples; concrete values +# don't matter for the property test, only that the dtype carries enum +# metadata h5py preserves. +_ENUM_LABELS = {"NONE": 0, "LOW": 1, "MED": 2, "HIGH": 3} + + +def _h5py_compatible_dtype_strategy() -> st.SearchStrategy[np.dtype]: + """Sample one of the dtypes h5py can store in a dataset. + + Extends `base_numeric_dtype_strategy()` (bool/int*/uint*/float*/complex*) + with HDF5-specific kinds: + + - fixed-length bytes (`|S*`, numpy kind `S`) + - vlen utf-8 string (`h5py.string_dtype()`, numpy kind `O`) + - vlen ASCII string (`h5py.string_dtype(encoding="ascii")`, kind `O`) + - vlen array of a numeric base (`h5py.vlen_dtype(base)`, kind `O`, + each element is its own variable-length numpy array) + - enum (`h5py.enum_dtype({...}, basetype=...)`, integer kind with + label metadata that doesn't survive the JSON metadata round-trip) + + All currently expose at least one form of downstream breakage — + that's intentional: the suite *highlights* breakages rather than + hides them. The new attribution categories distinguish parser-side + failures from upstream / downstream-shared ones. + + Excluded: + + - Structured/compound dtypes — can't be parametrised by a single + `np.dtype`; covered by curated `TestCompoundDtype` cases. + - Object/region reference dtypes (`h5py.ref_dtype`, + `h5py.regionref_dtype`) — require a multi-dataset graph + (referrer + referent), not suitable for the single-dataset + writer template. Worth a separate curated module if real bugs + surface. + - Opaque dtype (`np.dtype(("V", n))`) — rare; not worth the + strategy weight today. + """ + return st.one_of( + base_numeric_dtype_strategy(), + st.sampled_from( + [ + np.dtype("S1"), + np.dtype("S5"), + np.dtype("S20"), + h5py.string_dtype(), # vlen utf-8 (kind 'O') + h5py.string_dtype(encoding="ascii"), # vlen ASCII (kind 'O') + h5py.vlen_dtype(np.dtype("i4")), # vlen array of int32 + h5py.vlen_dtype(np.dtype("f8")), # vlen array of float64 + h5py.enum_dtype(_ENUM_LABELS, basetype=np.dtype("i1")), + h5py.enum_dtype(_ENUM_LABELS, basetype=np.dtype("i4")), + ] + ), + ) + + +def _value_in_dtype_strategy(dtype: np.dtype) -> st.SearchStrategy[Any]: + """Return a strategy producing a value compatible with `dtype`. + + Delegates to `base_value_in_dtype_strategy` for the universally-supported + kinds and adds HDF-specific branches: + + - vlen utf-8 string: any unicode text + - vlen ASCII string: only ASCII codepoints (h5py rejects non-ASCII + bytes when the dtype declares ASCII encoding) + - enum: an integer from the enum's label set (h5py would accept any + int of the base dtype, but values outside the label set don't + round-trip semantically; we constrain for realism) + """ + str_info = h5py.check_string_dtype(dtype) + if str_info is not None: + if str_info.encoding == "ascii": + return st.text( + alphabet=string.ascii_letters + string.digits + " ", + min_size=0, + max_size=20, + ) + return st.text(min_size=0, max_size=20) + # Enum dtypes look numeric (kind 'i'/'u') with metadata; if metadata + # carries an enum label-map, constrain values to those labels. + enum_map = h5py.check_enum_dtype(dtype) + if enum_map is not None: + return st.sampled_from(sorted(set(enum_map.values()))) + return _base_value_in_dtype_strategy(dtype) + + +def _data_in_dtype_strategy( + dtype: np.dtype, shape: tuple[int, ...] +) -> st.SearchStrategy[np.ndarray]: + """Generate a numpy array of the given dtype and shape. + + Branches per HDF5-specific dtype because `hypothesis.extra.numpy.arrays` + doesn't handle h5py's vlen / enum dtypes. + + - vlen string (utf-8 or ASCII): build a flat list of strings and + reshape into the requested shape. + - vlen array: each element is its own length-varying numpy array + of the vlen base type. + - enum: integer storage with values drawn from the label set, then + reshape. + - everything else: delegate to the shared numeric data strategy. + """ + str_info = h5py.check_string_dtype(dtype) + if str_info is not None: + n = int(np.prod(shape)) if len(shape) else 1 + if str_info.encoding == "ascii": + text_strategy = st.text( + alphabet=string.ascii_letters + string.digits + " ", + min_size=0, + max_size=20, + ) + else: + text_strategy = st.text(min_size=0, max_size=20) + return st.lists(text_strategy, min_size=n, max_size=n).map( + lambda xs: np.array(xs, dtype=dtype).reshape(shape) + ) + + vlen_base = h5py.check_vlen_dtype(dtype) + if vlen_base is not None: + # Each element of the outer array is itself a length-varying + # numpy array of `vlen_base`. Build flat then reshape. + n = int(np.prod(shape)) if len(shape) else 1 + + @st.composite + def _vlen_array_data(draw): + lengths = draw( + st.lists( + st.integers(min_value=0, max_value=4), + min_size=n, + max_size=n, + ) + ) + arr = np.empty(n, dtype=dtype) + for i, length in enumerate(lengths): + arr[i] = draw(npst.arrays(dtype=vlen_base, shape=(length,))) + return arr.reshape(shape) + + return _vlen_array_data() + + enum_map = h5py.check_enum_dtype(dtype) + if enum_map is not None: + # Storage is the integer base; constrain values to the label set + # so the data is semantically meaningful for an enum. + labels = sorted(set(enum_map.values())) + n = int(np.prod(shape)) if len(shape) else 1 + return st.lists(st.sampled_from(labels), min_size=n, max_size=n).map( + lambda xs: np.array(xs, dtype=dtype).reshape(shape) + ) + + return _base_data_in_dtype_strategy(dtype, shape) + + +@st.composite +def _basic_dataset_strategy(draw) -> _DatasetSpec: + dtype = draw(_h5py_compatible_dtype_strategy()) + # 1- to 3-D shapes with modest sides; covers axis-naming, multi-chunk, + # and chunk-vs-shape edge cases without blowing up data volume. + shape = draw(npst.array_shapes(min_dims=1, max_dims=3, min_side=1, max_side=8)) + chunks_choice = draw(st.booleans()) + chunks = ( + tuple(draw(st.integers(min_value=1, max_value=size)) for size in shape) + if chunks_choice + else None + ) + data = draw(_data_in_dtype_strategy(dtype, shape)) + + # Vlen-array dtypes don't have a meaningful array-level fill value + # (h5py's default is an empty inner array). Skip the fill-value + # sweep for them; data-shape coverage alone is the signal here. + if h5py.check_vlen_dtype(dtype) is not None: + return _DatasetSpec(dtype=dtype, data=data, chunks=chunks) + + fill_in_dtype = draw(_value_in_dtype_strategy(dtype)) + dataset_fillvalue = draw(st.sampled_from([_UNSET, fill_in_dtype])) + fill_value_attr = draw(st.sampled_from([_UNSET, fill_in_dtype])) + return _DatasetSpec( + dtype=dtype, + data=data, + dataset_fillvalue=dataset_fillvalue, + fill_value_attr=fill_value_attr, + chunks=chunks, + ) + + +def _spec_to_url(spec: _DatasetSpec, tmp_path: Path) -> str: + """Translate a _DatasetSpec into a written file and return its path + (without file:// prefix, for use with the equivalence helper).""" + url = _write_hdf5( + tmp_path / "data.nc", + dtype=spec.dtype, + data=spec.data, + dataset_fillvalue=spec.dataset_fillvalue, + fill_value_attr=spec.fill_value_attr, + scale_factor=spec.scale_factor, + add_offset=spec.add_offset, + unsigned=spec.unsigned, + chunks=spec.chunks, + extra_attrs=spec.extra_attrs, + ) + return url.removeprefix("file://") + + +# Curated examples — each forces a specific dtype/fill combination we want +# guaranteed coverage of, separate from the random strategy draws. +_BASIC_EXAMPLES = [ + # NaN float _FillValue + _DatasetSpec( + dtype=np.dtype("f8"), + data=np.array([1.0, 2.0, 3.0], dtype="f8"), + fill_value_attr=np.nan, + ), + # uint8 with both dataset.fillvalue and matching _FillValue + _DatasetSpec( + dtype=np.dtype("u1"), + data=np.array([1, 2, 3], dtype="u1"), + dataset_fillvalue=255, + fill_value_attr=255, + ), + # bool _FillValue=True + _DatasetSpec( + dtype=np.dtype("?"), + data=np.array([True, False, True], dtype="?"), + fill_value_attr=True, + ), + # vlen utf-8 with empty-string sentinel _FillValue + _DatasetSpec( + dtype=h5py.string_dtype(), + data=np.array(["a", "b"], dtype=h5py.string_dtype()), + fill_value_attr="", + ), + # vlen utf-8 with a real sentinel value + _DatasetSpec( + dtype=h5py.string_dtype(), + data=np.array(["a", "b"], dtype=h5py.string_dtype()), + fill_value_attr="MISSING", + ), + # vlen utf-8 with no _FillValue at all + _DatasetSpec( + dtype=h5py.string_dtype(), + data=np.array(["a", "b"], dtype=h5py.string_dtype()), + ), + # S10 with empty-bytes sentinel _FillValue + _DatasetSpec( + dtype=np.dtype("S10"), + data=np.array([b"hello", b"world"], dtype="S10"), + fill_value_attr=b"", + ), + # S5 with a real sentinel _FillValue + _DatasetSpec( + dtype=np.dtype("S5"), + data=np.array([b"a", b"b"], dtype="S5"), + fill_value_attr=b"x", + ), + # ASCII vlen string with an explicit ASCII sentinel + _DatasetSpec( + dtype=h5py.string_dtype(encoding="ascii"), + data=np.array(["alpha", "beta"], dtype=h5py.string_dtype(encoding="ascii")), + fill_value_attr="MISSING", + ), + # int8-backed enum with fill = one of the labels (LOW=1) + _DatasetSpec( + dtype=h5py.enum_dtype(_ENUM_LABELS, basetype=np.dtype("i1")), + data=np.array( + [0, 1, 2, 3], dtype=h5py.enum_dtype(_ENUM_LABELS, basetype=np.dtype("i1")) + ), + fill_value_attr=np.int8(1), + ), + # int32-backed enum with no fill + _DatasetSpec( + dtype=h5py.enum_dtype(_ENUM_LABELS, basetype=np.dtype("i4")), + data=np.array( + [0, 2], dtype=h5py.enum_dtype(_ENUM_LABELS, basetype=np.dtype("i4")) + ), + ), + # vlen array of int32, no fill (vlen-array dtypes don't have one) + _DatasetSpec( + dtype=h5py.vlen_dtype(np.dtype("i4")), + data=np.array( + [ + np.array([1, 2, 3], dtype="i4"), + np.array([], dtype="i4"), + np.array([4], dtype="i4"), + ], + dtype=h5py.vlen_dtype(np.dtype("i4")), + ), + ), + # vlen array of float64, no fill + _DatasetSpec( + dtype=h5py.vlen_dtype(np.dtype("f8")), + data=np.array( + [ + np.array([1.0, 2.5], dtype="f8"), + np.array([np.nan], dtype="f8"), + ], + dtype=h5py.vlen_dtype(np.dtype("f8")), + ), + ), +] + + +_BASIC_EXAMPLE_IDS = [ + "float64-nan-fill", + "uint8-with-fill", + "bool-fill", + "vlen-empty-fill", + "vlen-sentinel-fill", + "vlen-no-fill", + "S10-empty-fill", + "S5-sentinel-fill", + "ascii-vlen-sentinel-fill", + "enum-i1-with-fill", + "enum-i4-no-fill", + "vlen-array-i4", + "vlen-array-f8", +] + + +@requires_hdf5plugin +@requires_imagecodecs +class TestBasicEquivalence: + """Equivalence sweep across the dtype × fill matrix. + + The random-draw strategy generates the *full* parameter range that + `docs/custom_parsers.md` says the parser should support — including + string-kind dtypes with explicit `_FillValue`, vlen-utf-8 with and + without fill, etc. + + Curated examples and random draws are **separate test methods** so + pytest-xdist can parallelise the curated cases and a single failing + example doesn't shadow the others. Each curated example shows as + its own test ID (`test_equivalence_curated[]`); random draws + aggregate under `test_equivalence_random`. + + The assertion is plain `_assert_h5netcdf_virtualizarr_identical`. + Cases the parser/stack doesn't handle correctly today **will fail**; + those failures are the to-fix list. Suite green = fixes complete. + """ + + @pytest.mark.parametrize("spec", _BASIC_EXAMPLES, ids=_BASIC_EXAMPLE_IDS) + def test_equivalence_curated(self, spec, tmp_path): + filepath = _spec_to_url(spec, tmp_path) + _assert_h5netcdf_virtualizarr_identical(filepath) + + # tmp_path is function-scoped, but each hypothesis example writes the + # same `data.nc` and reads it before the next example runs. The + # overwrite-in-place is intentional. + @settings(suppress_health_check=[HealthCheck.function_scoped_fixture]) + @given(spec=_basic_dataset_strategy()) + def test_equivalence_random(self, spec, tmp_path): + filepath = _spec_to_url(spec, tmp_path) + _assert_h5netcdf_virtualizarr_identical(filepath) + + +# String-dtype known failures are now covered by _BASIC_EXAMPLES (curated +# entries) + random draws (via _h5py_compatible_dtype_strategy) inside +# TestBasicEquivalence. The `_expected_failure_for(spec)` predicate +# asserts the precise failure mode, so a change there is detected. +# No separate test class needed. + + +@st.composite +def _packed_dataset_strategy(draw) -> _DatasetSpec: + """Packed-data scenarios: integer storage + scale_factor + add_offset + + _FillValue in the packed (encoded) domain. + + Per docs/custom_parsers.md "Fill values in packed data": the + _FillValue must be in the packed domain; xarray applies CFMaskCoder + before CFScaleOffsetCoder on decode. + """ + dtype = draw( + st.sampled_from( + [ + np.dtype("i2"), + np.dtype("i4"), + np.dtype("u2"), + np.dtype("u4"), + ] + ) + ) + scale_factor = draw( + st.floats(min_value=1e-3, max_value=1e3, allow_nan=False, allow_infinity=False) + ) + add_offset = draw( + st.floats(min_value=-1e3, max_value=1e3, allow_nan=False, allow_infinity=False) + ) + shape = draw(npst.array_shapes(min_dims=1, max_dims=3, min_side=1, max_side=8)) + data = draw(npst.arrays(dtype=dtype, shape=shape)) + fill_value_attr = draw(_value_in_dtype_strategy(dtype)) + return _DatasetSpec( + dtype=dtype, + data=data, + fill_value_attr=fill_value_attr, + scale_factor=scale_factor, + add_offset=add_offset, + ) + + +_PACKED_EXAMPLES = [ + # int16 packed: scale=0.01, offset=0, _FillValue=-9999 (canonical CF case) + _DatasetSpec( + dtype=np.dtype("i2"), + data=np.array([100, 200, 300], dtype="i2"), + fill_value_attr=np.int16(-9999), + scale_factor=0.01, + add_offset=0.0, + ), + # uint16 packed: scale=0.1, offset=1000, _FillValue=0 + _DatasetSpec( + dtype=np.dtype("u2"), + data=np.array([10, 20, 30], dtype="u2"), + fill_value_attr=np.uint16(0), + scale_factor=0.1, + add_offset=1000.0, + ), +] + + +@requires_hdf5plugin +@requires_imagecodecs +class TestPackedEquivalence: + @settings(suppress_health_check=[HealthCheck.function_scoped_fixture]) + @given(spec=_packed_dataset_strategy()) + @example(spec=_PACKED_EXAMPLES[0]) + @example(spec=_PACKED_EXAMPLES[1]) + def test_equivalence(self, spec, tmp_path): + filepath = _spec_to_url(spec, tmp_path) + _assert_h5netcdf_virtualizarr_identical(filepath) + + +@st.composite +def _unsigned_dataset_strategy(draw) -> _DatasetSpec: + """Signed-integer storage with _Unsigned="true" attribute. + + Per docs/custom_parsers.md "_Unsigned": fill value is interpreted in + the unsigned domain; e.g. _FillValue=-1 stored as int16 corresponds + to 65535 when reinterpreted as uint16. + """ + dtype = draw( + st.sampled_from( + [ + np.dtype("i1"), + np.dtype("i2"), + np.dtype("i4"), + np.dtype("i8"), + ] + ) + ) + shape = draw(npst.array_shapes(min_dims=1, max_dims=3, min_side=1, max_side=8)) + data = draw(npst.arrays(dtype=dtype, shape=shape)) + fill_value_attr = draw(_value_in_dtype_strategy(dtype)) + return _DatasetSpec( + dtype=dtype, + data=data, + fill_value_attr=fill_value_attr, + unsigned=True, + ) + + +_UNSIGNED_EXAMPLES = [ + # int16 stored, reinterpreted as uint16; _FillValue=-1 -> 65535 + _DatasetSpec( + dtype=np.dtype("i2"), + data=np.array([1, 2, 3], dtype="i2"), + fill_value_attr=np.int16(-1), + unsigned=True, + ), +] + + +@requires_hdf5plugin +@requires_imagecodecs +class TestUnsignedEquivalence: + @settings(suppress_health_check=[HealthCheck.function_scoped_fixture]) + @given(spec=_unsigned_dataset_strategy()) + @example(spec=_UNSIGNED_EXAMPLES[0]) + def test_equivalence(self, spec, tmp_path): + filepath = _spec_to_url(spec, tmp_path) + _assert_h5netcdf_virtualizarr_identical(filepath) + + +# Compound/structured dtypes don't fit the orthogonal axes of the basic / +# packed / _Unsigned strategies (no integer/float kind, no scale_factor +# semantics, no signed-vs-unsigned reinterpretation). They're also too +# combinatorially varied to draw from a hypothesis strategy without heavy +# `assume()` filtering. A handful of curated examples is enough. +_COMPOUND_DTYPES_AND_FILLS = [ + ( + np.dtype([("a", "i4"), ("b", "f8")]), + np.array([(1, 1.0), (2, 2.0), (3, 3.0)], dtype=[("a", "i4"), ("b", "f8")]), + np.array((0, 0.0), dtype=[("a", "i4"), ("b", "f8")]), + ), + ( + np.dtype([("x", "u2"), ("y", "u2"), ("z", "u2")]), + np.array( + [(10, 20, 30), (40, 50, 60)], + dtype=[("x", "u2"), ("y", "u2"), ("z", "u2")], + ), + np.array( + (65535, 65535, 65535), + dtype=[("x", "u2"), ("y", "u2"), ("z", "u2")], + ), + ), +] + + +@requires_hdf5plugin +@requires_imagecodecs +class TestCompoundDtype: + """Compound/structured-dtype datasets with _FillValue. + + Compound dtypes can't be generated by the uniform-dtype basic + strategy (no single `np.dtype` to sample); they're covered here as + curated parametrized cases. Assertion is plain equivalence — any + failure is a real issue to fix. + """ + + @pytest.mark.parametrize( + "dtype,data,fill_value_attr", + _COMPOUND_DTYPES_AND_FILLS, + ids=["two-field", "three-field"], + ) + def test_equivalence(self, dtype, data, fill_value_attr, tmp_path): + spec = _DatasetSpec(dtype=dtype, data=data, fill_value_attr=fill_value_attr) + filepath = _spec_to_url(spec, tmp_path) + _assert_h5netcdf_virtualizarr_identical(filepath) + + +# Cross-axis interactions: the basic / packed / _Unsigned classes are +# deliberately orthogonal so each hypothesis strategy stays clean, but +# real files combine multiple features. These curated examples cover the +# combinations most likely to harbor latent bugs: packed + chunked + fill, +# _Unsigned + chunked + fill, multi-dim + packed + fill, multi-dim + _Unsigned. +_COMBINED_EXAMPLES = [ + # packed + chunked + _DatasetSpec( + dtype=np.dtype("i2"), + data=np.arange(100, dtype="i2"), + fill_value_attr=np.int16(-9999), + scale_factor=0.01, + add_offset=0.0, + chunks=(10,), + ), + # _Unsigned + chunked + explicit dataset.fillvalue + _DatasetSpec( + dtype=np.dtype("i2"), + data=np.arange(64, dtype="i2"), + dataset_fillvalue=np.int16(-1), + fill_value_attr=np.int16(-1), + unsigned=True, + chunks=(8,), + ), + # 2D packed + chunked + _DatasetSpec( + dtype=np.dtype("i4"), + data=np.arange(24, dtype="i4").reshape(4, 6), + fill_value_attr=np.int32(-32768), + scale_factor=0.001, + add_offset=-100.0, + chunks=(2, 3), + ), + # 3D _Unsigned + _DatasetSpec( + dtype=np.dtype("i1"), + data=np.arange(8, dtype="i1").reshape(2, 2, 2), + fill_value_attr=np.int8(-1), + unsigned=True, + ), + # packed + dataset.fillvalue in packed domain matching _FillValue + _DatasetSpec( + dtype=np.dtype("u2"), + data=np.array([10, 20, 30], dtype="u2"), + dataset_fillvalue=np.uint16(65535), + fill_value_attr=np.uint16(65535), + scale_factor=0.1, + add_offset=0.0, + ), +] + + +@requires_hdf5plugin +@requires_imagecodecs +class TestCombinedEquivalence: + @pytest.mark.parametrize( + "spec", + _COMBINED_EXAMPLES, + ids=[ + "packed+chunked", + "unsigned+chunked+fill", + "2D+packed+chunked", + "3D+unsigned", + "packed+matching-fill", + ], + ) + def test_equivalence(self, spec, tmp_path): + filepath = _spec_to_url(spec, tmp_path) + _assert_h5netcdf_virtualizarr_identical(filepath) + + +# Extra-CF-attribute scenarios: xarray does not specially process these +# attributes; the parser MUST round-trip them as-is. The raw-attrs +# invariant (decode_cf=False) is what surfaces a drop or transformation +# directly; the decoded-data invariant covers the user-facing path. +_EXTRA_ATTR_EXAMPLES = [ + # missing_value alongside _FillValue + _DatasetSpec( + dtype=np.dtype("f4"), + data=np.array([1.0, 2.0, 3.0], dtype="f4"), + fill_value_attr=np.float32(-9999.0), + extra_attrs={"missing_value": np.float32(-9998.0)}, + ), + # valid_range as a 2-element array + _DatasetSpec( + dtype=np.dtype("i2"), + data=np.array([10, 20, 30], dtype="i2"), + extra_attrs={"valid_range": np.array([0, 100], dtype="i2")}, + ), + # valid_min / valid_max as separate scalars + _DatasetSpec( + dtype=np.dtype("f8"), + data=np.array([0.5, 1.0, 1.5], dtype="f8"), + extra_attrs={"valid_min": 0.0, "valid_max": 2.0}, + ), + # missing_value + valid_range together + _DatasetSpec( + dtype=np.dtype("i4"), + data=np.array([1, 2, 3], dtype="i4"), + fill_value_attr=np.int32(-1), + extra_attrs={ + "missing_value": np.int32(-2), + "valid_range": np.array([0, 1000], dtype="i4"), + }, + ), +] + + +@requires_hdf5plugin +@requires_imagecodecs +class TestExtraAttributes: + """Pass-through of missing_value / valid_range / valid_min / valid_max. + + xarray does not specially process these CF attributes — the parser + must preserve them verbatim. The raw-attrs assertion catches any drop + or transformation directly. + """ + + @pytest.mark.parametrize( + "spec", + _EXTRA_ATTR_EXAMPLES, + ids=[ + "missing_value+fill", + "valid_range", + "valid_min+valid_max", + "missing_value+valid_range", + ], + ) + def test_equivalence(self, spec, tmp_path): + filepath = _spec_to_url(spec, tmp_path) + _assert_h5netcdf_virtualizarr_identical(filepath) diff --git a/virtualizarr/tests/test_parsers/test_zarr_fill_value_equivalence.py b/virtualizarr/tests/test_parsers/test_zarr_fill_value_equivalence.py new file mode 100644 index 00000000..cb48a301 --- /dev/null +++ b/virtualizarr/tests/test_parsers/test_zarr_fill_value_equivalence.py @@ -0,0 +1,410 @@ +"""Property-based equivalence tests for the ZarrParser's fill-value handling. + +Each test writes a Zarr v3 store via zarr-python, then opens it two ways: + + 1. `xr.open_zarr(path)` directly (reference) — the user's "no virtualizarr" + baseline. + 2. virtualizarr's `ZarrParser` → `ManifestStore` → `xr.open_dataset(..., + engine="zarr")` (observed) — the under-test path. + +Both go through xarray's zarr backend ultimately; we're asserting that +virtualizarr's manifest-construction layer is transparent. Failures +indicate the parser is either dropping/transforming metadata or +mis-resolving fill values during manifest construction. + +Test philosophy mirrors the HDF equivalence module: strategies generate +the full parameter range that should work; assertions are plain (no +`xfail`, no `pytest.raises`). Failures are TODO items. + +Concrete issue coverage: + +- **#811** (`ZarrParser get_metadata fails for LocalStore Zarr with + dtype uint8`): a curated example with `dtype=uint8` and no explicit + fill_value exercises the default-fill-lookup path that crashed in + the bug report. +- **Cross-parser fill-value consistency:** the random strategy + generates every numeric dtype with various fill-value shapes, + surfacing any divergence from xarray's direct view. + +Shared infrastructure (sentinel, hypothesis profiles, equivalence +helpers, generic dtype strategies) lives in +`virtualizarr/tests/test_parsers/_fill_value_common.py`. + +CI throttling: +- VIRTUALIZARR_HYPOTHESIS_PROFILE=ci -> max_examples=10 (faster) +- VIRTUALIZARR_HYPOTHESIS_PROFILE=thorough -> max_examples=50 (default) +- pytest -m "not hypothesis_tests" -> skip this module entirely +""" + +from __future__ import annotations + +from dataclasses import dataclass +from pathlib import Path +from typing import Any + +import numpy as np +import pytest +import xarray as xr +import zarr +from hypothesis import HealthCheck, given, settings +from hypothesis import strategies as st +from hypothesis.extra import numpy as npst +from obspec_utils.registry import ObjectStoreRegistry +from obstore.store import LocalStore + +from virtualizarr.parsers import ZarrParser +from virtualizarr.tests import requires_arro3 +from virtualizarr.tests.test_parsers._fill_value_common import ( + _UNSET, + assert_equivalent, + base_numeric_dtype_strategy, + base_pytestmark, + data_in_dtype_strategy, + register_hypothesis_profiles, + value_in_dtype_strategy, +) + +pytestmark = base_pytestmark() + [requires_arro3] +register_hypothesis_profiles() + + +# --------------------------------------------------------------------------- +# _DatasetSpec — Zarr-flavoured. Compared to the HDF one this is much +# simpler: no h5py `dataset.fillvalue` / explicit-vs-default distinction +# (zarr v3 always has a fill_value), no _Unsigned, no scale_factor/add_offset +# in the metadata (those would be plain attributes if present). +# --------------------------------------------------------------------------- + + +@dataclass +class _DatasetSpec: + """A self-contained description of one Zarr v3 store to write for testing. + + Fields: + dtype: the array's numeric dtype. + data: the data to write. + fill_value: zarr's storage-level `fill_value`. `_UNSET` means "let + zarr-python pick the dtype default" (which is what triggers + #811 for uint8). + fill_value_attr: the CF `_FillValue` attribute on the array. Set + independently of `fill_value` — they're allowed to disagree + in CF. + chunks: explicit chunk shape; None means single-chunk. + extra_attrs: free-form CF-style attributes to round-trip + (`missing_value`, `valid_range`, etc.). + """ + + dtype: np.dtype + data: np.ndarray + fill_value: Any = _UNSET + fill_value_attr: Any = _UNSET + chunks: tuple[int, ...] | None = None + extra_attrs: dict[str, Any] | None = None + + +def _write_zarr( + filepath: Path | str, + *, + dtype: np.dtype, + data: np.ndarray, + fill_value: Any = _UNSET, + fill_value_attr: Any = _UNSET, + chunks: tuple[int, ...] | None = None, + extra_attrs: dict[str, Any] | None = None, +) -> str: + """Write a single-variable Zarr v3 store with the requested shape. + + Returns the file:// URL for use with the ZarrParser. Coordinate + variables (one per dim) are written too so xarray's zarr backend + builds a proper Dataset with named dims rather than anonymous ones. + """ + filepath = str(filepath) + store = zarr.storage.LocalStore(filepath) + root = zarr.group(store=store, zarr_format=3) + + dimension_names = tuple(f"dim_{i}" for i in range(data.ndim)) + chunk_shape = chunks if chunks is not None else data.shape + + # zarr-python's `create_array` requires a fill_value. Passing + # `fill_value=None` would explicitly write null; to exercise the + # "user didn't specify" code path (issue #811) we want to omit the + # kwarg entirely. + arr_kwargs: dict[str, Any] = { + "name": "data", + "shape": data.shape, + "dtype": dtype, + "chunks": chunk_shape, + "dimension_names": dimension_names, + } + if fill_value is not _UNSET: + arr_kwargs["fill_value"] = fill_value + arr = root.create_array(**arr_kwargs) + arr[...] = data + + # zarr v3 attributes are stored in JSON, which doesn't accept numpy + # scalars. Real users storing CF-style fill values write Python + # primitives (int / float / str / list), so coerce here — this isn't + # a fidelity loss for the equivalence test, just matches the user + # contract. (How virtualizarr's parsers handle numpy-scalar attrs + # they read from h5py / etc. is a separate concern — see Issue #715.) + def _to_json_safe(v: Any) -> Any: + if isinstance(v, np.generic): + return v.item() + if isinstance(v, np.ndarray): + return v.tolist() + return v + + if fill_value_attr is not _UNSET: + arr.attrs["_FillValue"] = _to_json_safe(fill_value_attr) + if extra_attrs is not None: + for k, v in extra_attrs.items(): + arr.attrs[k] = _to_json_safe(v) + + # Write coordinate arrays for each dim so xarray sees a proper + # named-dim Dataset rather than just a bare DataArray. + for axis, (dim_name, size) in enumerate(zip(dimension_names, data.shape)): + coord = root.create_array( + name=dim_name, + shape=(size,), + dtype="i8", + chunks=(size,), + dimension_names=(dim_name,), + ) + coord[:] = np.arange(size) + + return f"file://{filepath}" + + +# --------------------------------------------------------------------------- +# Open callbacks: virtualizarr-via-ZarrParser (observed) vs xarray-direct +# (reference). Both ultimately go through xarray's zarr backend; the +# difference is whether virtualizarr's manifest layer sits in between. +# --------------------------------------------------------------------------- + + +def _open_virtualizarr(filepath: str, *, decode_cf: bool) -> xr.Dataset: + """Open `filepath` via virtualizarr's ZarrParser. Plugged into the + shared equivalence helpers as the `open_observed` callback. + """ + url = f"file://{filepath}" + store = LocalStore(prefix=filepath) + registry = ObjectStoreRegistry({url: store}) + parser = ZarrParser() + manifest_store = parser(url=url, registry=registry) + return xr.open_dataset( + manifest_store, + engine="zarr", + consolidated=False, + zarr_format=3, + decode_cf=decode_cf, + ) + + +def _open_zarr_direct(filepath: str, *, decode_cf: bool) -> xr.Dataset: + """Open `filepath` directly via `xr.open_zarr` — no virtualizarr. The + reference for the equivalence sweep. + """ + return xr.open_dataset( + filepath, + engine="zarr", + consolidated=False, + zarr_format=3, + decode_cf=decode_cf, + ) + + +def _assert_zarrparser_xarray_identical(filepath: str) -> None: + """Run both raw-attribute and decoded-data invariants against the Zarr + parser. Thin wrapper that binds the engines. + """ + assert_equivalent( + filepath, + open_observed=_open_virtualizarr, + open_reference=_open_zarr_direct, + ) + + +# --------------------------------------------------------------------------- +# Sanity tests — confirm the writer and equivalence helper work on a +# trivially-trivial file before any hypothesis runs. +# --------------------------------------------------------------------------- + + +def test_write_zarr_basic_roundtrip(tmp_path): + """`_write_zarr` produces a Zarr store with the expected shape, dtype, + and fill_value that zarr-python can read back.""" + url = _write_zarr( + tmp_path / "data.zarr", + dtype=np.dtype("f8"), + data=np.array([1.0, 2.0, 3.0], dtype="f8"), + fill_value=-9999.0, + chunks=(2,), + ) + assert url.startswith("file://") + + arr = zarr.open_array( + store=zarr.storage.LocalStore(url.removeprefix("file://")), + path="data", + mode="r", + ) + np.testing.assert_array_equal(arr[:], [1.0, 2.0, 3.0]) + assert arr.metadata.fill_value == -9999.0 + + +def test_equivalence_helper_passes_on_simple_zarr(tmp_path): + """A trivial float64 store should round-trip identically through both + open paths.""" + url = _write_zarr( + tmp_path / "simple.zarr", + dtype=np.dtype("f8"), + data=np.array([1.0, 2.0, 3.0, 4.0], dtype="f8"), + fill_value=0.0, + ) + _assert_zarrparser_xarray_identical(url.removeprefix("file://")) + + +# --------------------------------------------------------------------------- +# Random-draw strategy and basic equivalence sweep. +# +# Strategy generates the full numeric dtype matrix from +# base_numeric_dtype_strategy() (bool / signed int / unsigned int / float / +# complex) plus several shapes / fill-value patterns. Failures from +# random draws are real signals to investigate; the curated `@example`s +# pin specific bug-report cases (#811). +# --------------------------------------------------------------------------- + + +@st.composite +def _basic_zarr_dataset_strategy(draw) -> _DatasetSpec: + dtype = draw(base_numeric_dtype_strategy()) + shape = draw(npst.array_shapes(min_dims=1, max_dims=3, min_side=1, max_side=8)) + chunks_choice = draw(st.booleans()) + chunks = ( + tuple(draw(st.integers(min_value=1, max_value=size)) for size in shape) + if chunks_choice + else None + ) + data = draw(data_in_dtype_strategy(dtype, shape)) + fill_in_dtype = draw(value_in_dtype_strategy(dtype)) + # zarr v3 requires fill_value to be of the array dtype (or None for + # numeric "default zero"). We sample three options: omit entirely + # (triggers default-fill path — #811), set to a typed value, or set + # to a typed value and *also* expose it as a CF _FillValue attr. + fill_value = draw(st.sampled_from([_UNSET, fill_in_dtype])) + fill_value_attr = draw(st.sampled_from([_UNSET, fill_in_dtype])) + return _DatasetSpec( + dtype=dtype, + data=data, + fill_value=fill_value, + fill_value_attr=fill_value_attr, + chunks=chunks, + ) + + +def _spec_to_url(spec: _DatasetSpec, tmp_path: Path) -> str: + """Translate a `_DatasetSpec` to a written Zarr store and return its + bare path (no file:// prefix), for use with the equivalence helper. + """ + url = _write_zarr( + tmp_path / "data.zarr", + dtype=spec.dtype, + data=spec.data, + fill_value=spec.fill_value, + fill_value_attr=spec.fill_value_attr, + chunks=spec.chunks, + extra_attrs=spec.extra_attrs, + ) + return url.removeprefix("file://") + + +# Curated examples. Each pins a specific bug-report case or scenario we +# want guaranteed coverage of, separate from the random draws. +_BASIC_EXAMPLES = [ + # Issue #811: uint8 with no explicit fill_value — exercises the + # default-fill-lookup path that crashed `get_metadata` in the bug + # report. The Zarr v3 default for uint8 is 0; both engines should + # agree. + _DatasetSpec( + dtype=np.dtype("u1"), + data=np.array([1, 2, 3], dtype="u1"), + ), + # uint8 *with* an explicit fill_value — same dtype, but bypassing + # the default-lookup path. If only the default-lookup branch is + # broken, this passes while the no-fill case fails — useful signal. + _DatasetSpec( + dtype=np.dtype("u1"), + data=np.array([1, 2, 3], dtype="u1"), + fill_value=np.uint8(255), + ), + # NaN-valued _FillValue on float64. Tests NaN equality semantics + # through the parser. + _DatasetSpec( + dtype=np.dtype("f8"), + data=np.array([1.0, 2.0, 3.0], dtype="f8"), + fill_value=np.nan, + fill_value_attr=np.nan, + ), + # int32 with both fill_value and matching CF _FillValue. + _DatasetSpec( + dtype=np.dtype("i4"), + data=np.array([10, 20, 30], dtype="i4"), + fill_value=np.int32(-9999), + fill_value_attr=np.int32(-9999), + ), + # bool array with explicit fill_value=True. + _DatasetSpec( + dtype=np.dtype("?"), + data=np.array([True, False, True], dtype="?"), + fill_value=True, + ), +] + + +_BASIC_EXAMPLE_IDS = [ + "uint8-no-fill", # #811 reproducer + "uint8-with-fill", + "float64-nan-fill", + "int32-matching-fill", + "bool-fill", +] + + +def _ensure_clean_zarr_path(tmp_path: Path) -> None: + """Zarr's LocalStore needs an empty directory each draw. Tear down + any previous `data.zarr` so the writer starts clean. + """ + store_path = tmp_path / "data.zarr" + if store_path.exists(): + import shutil + + shutil.rmtree(store_path) + + +@requires_arro3 +class TestBasicEquivalence: + """Equivalence sweep across the Zarr numeric dtype × fill matrix. + + Curated examples and random draws are split into separate test + methods so pytest-xdist can parallelise the curated cases and a + single failing example doesn't shadow the others. + + Assertion is plain `_assert_zarrparser_xarray_identical`. Cases the + parser/stack doesn't handle correctly today **will fail** — those + failures are the to-fix list. + """ + + @pytest.mark.parametrize("spec", _BASIC_EXAMPLES, ids=_BASIC_EXAMPLE_IDS) + def test_equivalence_curated(self, spec, tmp_path): + _ensure_clean_zarr_path(tmp_path) + filepath = _spec_to_url(spec, tmp_path) + _assert_zarrparser_xarray_identical(filepath) + + # tmp_path is function-scoped, but each hypothesis example writes the + # same `data.zarr` and reads it before the next example runs. The + # overwrite-in-place is intentional. + @settings(suppress_health_check=[HealthCheck.function_scoped_fixture]) + @given(spec=_basic_zarr_dataset_strategy()) + def test_equivalence_random(self, spec, tmp_path): + _ensure_clean_zarr_path(tmp_path) + filepath = _spec_to_url(spec, tmp_path) + _assert_zarrparser_xarray_identical(filepath) diff --git a/virtualizarr/tests/test_parsers/test_zarr_spec_compliance.py b/virtualizarr/tests/test_parsers/test_zarr_spec_compliance.py new file mode 100644 index 00000000..e4ed5d74 --- /dev/null +++ b/virtualizarr/tests/test_parsers/test_zarr_spec_compliance.py @@ -0,0 +1,268 @@ +"""Zarr-spec compliance tests for the ZarrParser — xarray NOT in the loop. + +Asserts that virtualizarr's ZarrParser produces ManifestStores whose +metadata matches what zarr-python reads from the same source store. +**xarray is deliberately absent from the assertion path** — the +"reference" is zarr-python itself. + +Why this exists alongside `test_zarr_fill_value_equivalence.py`: + +The equivalence test compares `virtualizarr+xarray.open_zarr` against +`xarray.open_zarr` direct. When both engines hit xarray's +`FillValueCoder.decode` on a value the coder can't handle (e.g. +`_FillValue=NaN` as a JSON-native number), *both* sides fail with the +same exception — and the test correctly reports it as +`BothEnginesFailedIdenticallyError`. But that "failure" isn't a +virtualizarr bug: the parser is producing zarr-spec-compliant metadata, +and xarray's coder is what can't handle it. + +The root cause is that xarray's `_FillValue` encoding uses HDF5-style +base64 even when writing to Zarr (whose metadata is JSON-native). The +fix lives upstream in xarray (tracked at pydata/xarray#11332) — or in +a future zarr-native nodata convention (zarr-specs#351, +zarr-extensions#33). + +By testing against zarr-python directly, this module: + +1. Asserts virtualizarr correctness independent of xarray's quirks. +2. Stays green even when the equivalence module is red on the same + spec — surfacing that the gap is upstream, not in virtualizarr. +3. Establishes the "virtualizarr produces zarr-spec-compliant + manifests" contract as the primary correctness check. + +Tests here are deliberately simple: open via parser, open via +zarr-python, compare metadata fields directly. No hypothesis sweeps +(the equivalence module covers the dtype × fill matrix); these are +property-asserting curated cases. + +Shared infra (pytestmark, hypothesis profile) is imported from +`_fill_value_common.py` for consistency. +""" + +from __future__ import annotations + +from typing import Any + +import numpy as np +import zarr +from obspec_utils.registry import ObjectStoreRegistry +from obstore.store import LocalStore + +from virtualizarr.parsers import ZarrParser +from virtualizarr.tests import requires_arro3 +from virtualizarr.tests.test_parsers._fill_value_common import ( + _UNSET, + base_pytestmark, + register_hypothesis_profiles, +) + +pytestmark = base_pytestmark() + [requires_arro3] +register_hypothesis_profiles() + + +# --------------------------------------------------------------------------- +# Writer + opener helpers — minimal, since we're not sweeping over a matrix. +# --------------------------------------------------------------------------- + + +def _write_zarr_simple( + filepath, + *, + shape: tuple[int, ...], + dtype: Any, + fill_value: Any = _UNSET, + attrs: dict[str, Any] | None = None, +) -> str: + """Write a single-variable Zarr v3 store; return its file:// URL. + + Coordinate arrays are created for each dim so the store is + well-formed. + """ + store = zarr.storage.LocalStore(str(filepath)) + root = zarr.group(store=store, zarr_format=3) + dimension_names = tuple(f"dim_{i}" for i in range(len(shape))) + + arr_kwargs: dict[str, Any] = { + "name": "data", + "shape": shape, + "dtype": dtype, + "chunks": shape, + "dimension_names": dimension_names, + } + if fill_value is not _UNSET: + arr_kwargs["fill_value"] = fill_value + arr = root.create_array(**arr_kwargs) + arr[...] = np.zeros(shape, dtype=dtype) + + if attrs: + for k, v in attrs.items(): + arr.attrs[k] = v + + for i, size in enumerate(shape): + coord = root.create_array( + name=dimension_names[i], + shape=(size,), + dtype="i8", + chunks=(size,), + dimension_names=(dimension_names[i],), + ) + coord[:] = np.arange(size) + + return f"file://{filepath}" + + +def _open_manifest(url: str): + """Open the store via virtualizarr's ZarrParser; return a ManifestStore.""" + bare = url.removeprefix("file://") + store = LocalStore(prefix=bare) + registry = ObjectStoreRegistry({url: store}) + return ZarrParser()(url=url, registry=registry) + + +def _open_reference(url: str) -> zarr.Group: + """Open the store directly with zarr-python.""" + return zarr.open_group(url.removeprefix("file://"), zarr_format=3, mode="r") + + +def _manifest_data_metadata(ms): + """Reach into the ManifestStore for the `data` array's metadata. + + `_group` is the (currently underscore-prefixed) accessor used by + existing tests in `test_zarr.py` and `test_hdf.py`. If/when a + public accessor lands, update here in one place. + """ + return ms._group.arrays["data"].metadata + + +# --------------------------------------------------------------------------- +# Tests — each asserts one spec-compliance property. +# --------------------------------------------------------------------------- + + +@requires_arro3 +class TestZarrSpecCompliance: + """Virtualizarr's ZarrParser produces metadata that matches what + zarr-python reads from the same source. No xarray involvement. + """ + + def test_dtype_preserved(self, tmp_path): + url = _write_zarr_simple(tmp_path / "data.zarr", shape=(3,), dtype="f8") + ms_meta = _manifest_data_metadata(_open_manifest(url)) + ref_meta = _open_reference(url)["data"].metadata + assert ms_meta.data_type == ref_meta.data_type + + def test_shape_and_chunks_preserved(self, tmp_path): + url = _write_zarr_simple(tmp_path / "data.zarr", shape=(4, 6), dtype="i4") + ms_meta = _manifest_data_metadata(_open_manifest(url)) + ref_meta = _open_reference(url)["data"].metadata + assert ms_meta.shape == ref_meta.shape + assert ms_meta.chunk_grid == ref_meta.chunk_grid + + def test_default_fill_value_matches_zarr_python(self, tmp_path): + """When no explicit fill_value is set, the manifest store's + `fill_value` should match whatever zarr-python defaults to + (typically 0 for numeric dtypes). + + Also asserts the original #811 path: uint8 with no explicit + fill_value doesn't crash and produces sensible metadata. + """ + url = _write_zarr_simple(tmp_path / "data.zarr", shape=(3,), dtype="u1") + ms_meta = _manifest_data_metadata(_open_manifest(url)) + ref_meta = _open_reference(url)["data"].metadata + assert ms_meta.fill_value == ref_meta.fill_value + + def test_nan_fill_value_preserved(self, tmp_path): + """`fill_value=NaN` on a float dtype round-trips: zarr-python and + virtualizarr's manifest agree (both store NaN; xarray's + ability to decode it is a separate question, out of scope here). + """ + url = _write_zarr_simple( + tmp_path / "data.zarr", + shape=(3,), + dtype="f8", + fill_value=float("nan"), + ) + ms_fv = _manifest_data_metadata(_open_manifest(url)).fill_value + ref_fv = _open_reference(url)["data"].metadata.fill_value + assert np.isnan(ms_fv) and np.isnan(ref_fv) + + def test_fill_value_attr_is_json_native(self, tmp_path): + """A `_FillValue` attribute stored as a plain JSON number stays a + plain JSON number in the manifest — virtualizarr doesn't + re-encode it into HDF5-style base64 just because xarray's + `FillValueCoder` would. + + This is the test that distinguishes virtualizarr's contract from + xarray's: zarr metadata is JSON; scalar attributes should use + JSON-native types. + """ + url = _write_zarr_simple( + tmp_path / "data.zarr", + shape=(3,), + dtype="f4", + attrs={"_FillValue": 0.0}, + ) + attrs = _manifest_data_metadata(_open_manifest(url)).attributes + assert "_FillValue" in attrs + assert attrs["_FillValue"] == 0.0 + assert not isinstance(attrs["_FillValue"], (bytes, bytearray, np.ndarray)), ( + f"_FillValue should be a JSON-native scalar, got {type(attrs['_FillValue']).__name__}" + ) + + def test_attrs_match_zarr_python(self, tmp_path): + """Free-form CF-style attributes (missing_value, valid_range, + etc.) round-trip with the same JSON shape zarr-python sees. + """ + url = _write_zarr_simple( + tmp_path / "data.zarr", + shape=(3,), + dtype="f4", + attrs={ + "_FillValue": 0.0, + "valid_min": -10.0, + "valid_max": 10.0, + "missing_value": -9999.0, + }, + ) + ms_attrs = dict(_manifest_data_metadata(_open_manifest(url)).attributes) + ref_attrs = dict(_open_reference(url)["data"].attrs) + # Filter out any zarr-internal book-keeping keys (none today, + # but defensive — these tests shouldn't break on future zarr + # versions adding metadata). + for k in ("_ARRAY_DIMENSIONS",): + ms_attrs.pop(k, None) + ref_attrs.pop(k, None) + assert ms_attrs == ref_attrs + + +class TestZarrParserVsXarrayContrast: + """One-test demonstration of the framing shift: a `_FillValue` value + that virtualizarr produces spec-compliantly but xarray's + `FillValueCoder.decode` can't consume. + + Green test here + red test in `test_zarr_fill_value_equivalence.py` + on the same fixture proves: the gap is upstream of virtualizarr, + not in the parser. + """ + + def test_nan_fill_value_attr_zarr_compliant_but_xarray_breaks(self, tmp_path): + """`_FillValue=NaN` as a JSON-native value: virtualizarr's + ManifestStore preserves it correctly (assertion below passes). + xarray's `FillValueCoder.decode` on the same store raises + `TypeError: Failed to decode fill_value: expected str or bytes + for dtype float64, got float` — see the equivalence module's + `test_equivalence_curated[float64-nan-fill]` failure. The two + outcomes together establish the upstream attribution. + """ + url = _write_zarr_simple( + tmp_path / "data.zarr", + shape=(3,), + dtype="f8", + fill_value=float("nan"), + attrs={"_FillValue": float("nan")}, + ) + attrs = _manifest_data_metadata(_open_manifest(url)).attributes + assert "_FillValue" in attrs + # Plain Python float, not base64-encoded bytes. + assert isinstance(attrs["_FillValue"], float) + assert np.isnan(attrs["_FillValue"])