From 9ed7c07c65725fa68237368919040b7cfac7776a Mon Sep 17 00:00:00 2001 From: sharkinsspatial Date: Mon, 4 May 2026 15:17:36 -0400 Subject: [PATCH 1/5] Correctly handle HDF5 fillvalue for string dtype arrays. --- virtualizarr/codecs.py | 6 ++-- virtualizarr/parsers/hdf/hdf.py | 25 +++++++++++++-- virtualizarr/tests/test_parsers/conftest.py | 32 +++++++++++++++++++ .../tests/test_parsers/test_hdf/test_hdf.py | 13 ++++++++ 4 files changed, 72 insertions(+), 4 deletions(-) diff --git a/virtualizarr/codecs.py b/virtualizarr/codecs.py index d60f2fc1..093f6ee8 100644 --- a/virtualizarr/codecs.py +++ b/virtualizarr/codecs.py @@ -4,7 +4,7 @@ import zarr from zarr.abc.codec import ArrayArrayCodec, ArrayBytesCodec, BytesBytesCodec from zarr.abc.codec import Codec as ZarrCodec -from zarr.codecs import BytesCodec +from zarr.codecs import BytesCodec, VLenUTF8Codec from zarr.core.codec_pipeline import BatchedCodecPipeline from zarr.core.metadata.v3 import ArrayV3Metadata @@ -106,7 +106,9 @@ def convert_to_codec_pipeline( arrayarray_codecs, arraybytes_codec, bytesbytes_codecs = extract_codecs(zarr_codecs) if arraybytes_codec is None: - if dtype.byteorder == ">": + if isinstance(dtype, np.dtypes.StringDType) or dtype.kind == "O": + arraybytes_codec = VLenUTF8Codec() + elif dtype.byteorder == ">": arraybytes_codec = BytesCodec(endian="big") else: arraybytes_codec = BytesCodec() diff --git a/virtualizarr/parsers/hdf/hdf.py b/virtualizarr/parsers/hdf/hdf.py index d9cef4ff..aa2051b4 100644 --- a/virtualizarr/parsers/hdf/hdf.py +++ b/virtualizarr/parsers/hdf/hdf.py @@ -35,6 +35,22 @@ from h5py import Group as H5Group +def _get_fill_value(dataset: H5Dataset): + """ + Extract the fill value from an h5py dataset, handling string/bytes dtypes + that don't return numpy scalars from dataset.fillvalue. + """ + raw = dataset.fillvalue + if isinstance(raw, bytes): + return raw.decode("utf-8", errors="replace") + elif isinstance(raw, str): + return raw + elif isinstance(raw, np.generic): + return raw.item() + else: + return raw + + def _construct_manifest_array( filepath: str, dataset: H5Dataset, @@ -64,6 +80,11 @@ def _construct_manifest_array( attrs = _extract_attrs(dataset) dtype = dataset.dtype + # HDF5 variable-length strings use numpy object dtype, which zarr v3 cannot + # resolve automatically. Map to StringDType which zarr maps to VariableLengthUTF8. + if dtype.kind == "O": + dtype = np.dtypes.StringDType() + # Temporarily disable use CF->Codecs - TODO re-enable in subsequent PR. # cfcodec = cfcodec_from_dataset(dataset) # if cfcodec: @@ -74,13 +95,13 @@ def _construct_manifest_array( # else: # dtype = dataset.dtype - if "_FillValue" in attrs: + if "_FillValue" in attrs and dtype.kind not in ("S", "U", "O", "T"): encoded_cf_fill_value = encode_cf_fill_value(attrs["_FillValue"], dtype) attrs["_FillValue"] = encoded_cf_fill_value codec_configs = [zarr_codec_config_to_v3(codec.get_config()) for codec in codecs] - fill_value = dataset.fillvalue.item() + fill_value = _get_fill_value(dataset) dims = tuple(_dataset_dims(dataset, group=group)) metadata = create_v3_array_metadata( shape=dataset.shape, diff --git a/virtualizarr/tests/test_parsers/conftest.py b/virtualizarr/tests/test_parsers/conftest.py index 23d30a4f..6cac0261 100644 --- a/virtualizarr/tests/test_parsers/conftest.py +++ b/virtualizarr/tests/test_parsers/conftest.py @@ -469,6 +469,38 @@ def chunked_roundtrip_hdf5_s3_file(minio_bucket, cf_array_fill_value_hdf5_file): return f"s3://{minio_bucket['bucket']}/{filepath}" +@pytest.fixture( + params=[ + {"dtype": "S10", "data": np.array([b"hello", b"world"], dtype="S10")}, + { + "dtype": None, + "data": np.array(["hello", "world"], dtype=h5py.string_dtype()), + }, + ], + ids=["fixed-length-bytes", "variable-length-string"], +) +def string_dtype_hdf5_url(tmp_path: Path, request) -> str: + filepath = str(tmp_path / "string_dtype.nc") + + with h5py.File(filepath, "w") as f: + f.create_dataset(name="data", data=request.param["data"]) + + return f"file://{filepath}" + + +@pytest.fixture +def string_dtype_with_fillvalue_hdf5_url(tmp_path: Path) -> str: + filepath = str(tmp_path / "string_dtype_fillvalue.nc") + + with h5py.File(filepath, "w") as f: + dset = f.create_dataset( + name="data", data=np.array(["hello", "world"], dtype=h5py.string_dtype()) + ) + dset.attrs["_FillValue"] = "" + + return f"file://{filepath}" + + @pytest.fixture() def big_endian_dtype_hdf5_file(tmpdir): filepath = f"{tmpdir}/big_endian.nc" diff --git a/virtualizarr/tests/test_parsers/test_hdf/test_hdf.py b/virtualizarr/tests/test_parsers/test_hdf/test_hdf.py index 9854174f..9d59224d 100644 --- a/virtualizarr/tests/test_parsers/test_hdf/test_hdf.py +++ b/virtualizarr/tests/test_parsers/test_hdf/test_hdf.py @@ -100,6 +100,19 @@ def test_cf_fill_value(self, cf_fill_value_hdf5_file): metadata = manifest_store._group.arrays["data"].metadata assert "_FillValue" in metadata.attributes + def test_string_dtype_fill_value(self, string_dtype_hdf5_url): + manifest_store = manifest_store_from_hdf_url(string_dtype_hdf5_url) + metadata = manifest_store._group.arrays["data"].metadata + assert isinstance(metadata.fill_value, (str, bytes, np.bytes_)) + + def test_string_dtype_cf_fill_value(self, string_dtype_with_fillvalue_hdf5_url): + manifest_store = manifest_store_from_hdf_url( + string_dtype_with_fillvalue_hdf5_url + ) + metadata = manifest_store._group.arrays["data"].metadata + assert "_FillValue" in metadata.attributes + assert isinstance(metadata.attributes["_FillValue"], str) + def test_cf_array_fill_value(self, cf_array_fill_value_hdf5_file): cf_array_fill_value_hdf5_url = f"file://{cf_array_fill_value_hdf5_file}" manifest_store = manifest_store_from_hdf_url(cf_array_fill_value_hdf5_url) From ecc6c026222bf261feb7e5820be99d0e91443df6 Mon Sep 17 00:00:00 2001 From: Aimee Barciauskas Date: Mon, 18 May 2026 12:40:48 -0700 Subject: [PATCH 2/5] fix: handle H5Dataset missing fillvalue attribute with dtype-based fallback (#993) * fix: handle H5Dataset missing fillvalue attribute with dtype-based fallback Some h5py Dataset objects do not expose a fillvalue attribute. Guard against AttributeError in _get_fill_value and fall back to np.ma.default_fill_value for the dataset's dtype. Adds a unit test covering this path. Co-Authored-By: Claude Sonnet 4.6 * Add release note * fix: catch RuntimeError (not AttributeError) from h5py fillvalue h5py.Dataset.fillvalue delegates to the HDF5 C library via Cython and raises RuntimeError for unsupported dtypes (e.g. variable-length strings, compound types). AttributeError can never fire on a real h5py.Dataset since fillvalue is always a defined property. Co-Authored-By: Claude Sonnet 4.6 --------- Co-authored-by: Claude Sonnet 4.6 --- docs/releases.md | 6 ++++++ virtualizarr/parsers/hdf/hdf.py | 5 ++++- .../tests/test_parsers/test_hdf/test_hdf.py | 17 +++++++++++++++++ 3 files changed, 27 insertions(+), 1 deletion(-) diff --git a/docs/releases.md b/docs/releases.md index 0f1ce158..022a76aa 100644 --- a/docs/releases.md +++ b/docs/releases.md @@ -1,5 +1,11 @@ # Release notes +## Unreleased + +### Bug Fixes + +HDFParser now includes a `_get_fill_value` function which wraps `dataset.fillvalue` in a try/except AttributeError. + ## v2.6.1 (3rd May 2026) Adds end-to-end support for inlined chunk references in `ChunkManifest` (read via Kerchunk parsers, write via Kerchunk and Icechunk writers), plus Zarr-Python 3.2.0 compatibility and several bug fixes. diff --git a/virtualizarr/parsers/hdf/hdf.py b/virtualizarr/parsers/hdf/hdf.py index aa2051b4..d21c5f7e 100644 --- a/virtualizarr/parsers/hdf/hdf.py +++ b/virtualizarr/parsers/hdf/hdf.py @@ -40,7 +40,10 @@ def _get_fill_value(dataset: H5Dataset): Extract the fill value from an h5py dataset, handling string/bytes dtypes that don't return numpy scalars from dataset.fillvalue. """ - raw = dataset.fillvalue + try: + raw = dataset.fillvalue + except RuntimeError: + return np.ma.default_fill_value(dataset.dtype) if isinstance(raw, bytes): return raw.decode("utf-8", errors="replace") elif isinstance(raw, str): diff --git a/virtualizarr/tests/test_parsers/test_hdf/test_hdf.py b/virtualizarr/tests/test_parsers/test_hdf/test_hdf.py index 9d59224d..fac802a6 100644 --- a/virtualizarr/tests/test_parsers/test_hdf/test_hdf.py +++ b/virtualizarr/tests/test_parsers/test_hdf/test_hdf.py @@ -240,3 +240,20 @@ def test_netcdf_over_https(): ): np.testing.assert_allclose(ds["z"].min().to_numpy(), -6) np.testing.assert_allclose(ds["z"].max().to_numpy(), 817) + + +def test_fillvalue_runtime_error(): + from virtualizarr.parsers.hdf.hdf import _get_fill_value + + dtype = np.dtype("float32") + + class _RuntimeErrorDataset: + @property + def fillvalue(self): + raise RuntimeError("Unable to get fill value") + + dataset = _RuntimeErrorDataset() + dataset.dtype = dtype # type: ignore[attr-defined] + + result = _get_fill_value(dataset) + assert result == np.ma.default_fill_value(dtype) From 7d41afb43c7f8a4ab231a7282e9cfc3dec192c41 Mon Sep 17 00:00:00 2001 From: Max Jones <14077947+maxrjones@users.noreply.github.com> Date: Mon, 18 May 2026 15:41:54 -0400 Subject: [PATCH 3/5] Update virtualizarr/parsers/hdf/hdf.py --- virtualizarr/parsers/hdf/hdf.py | 13 ++++++++++++- 1 file changed, 12 insertions(+), 1 deletion(-) diff --git a/virtualizarr/parsers/hdf/hdf.py b/virtualizarr/parsers/hdf/hdf.py index d21c5f7e..1de4ee72 100644 --- a/virtualizarr/parsers/hdf/hdf.py +++ b/virtualizarr/parsers/hdf/hdf.py @@ -85,8 +85,19 @@ def _construct_manifest_array( # HDF5 variable-length strings use numpy object dtype, which zarr v3 cannot # resolve automatically. Map to StringDType which zarr maps to VariableLengthUTF8. - if dtype.kind == "O": + # Discriminate against other object-kind HDF5 dtypes (vlen arrays, object/ + # region references) that would silently be coerced to StringDType and + # produce garbage downstream — those aren't supported yet, so fail loudly. + if h5py.check_string_dtype(dtype) is not None: dtype = np.dtypes.StringDType() + elif dtype.kind == "O": + raise NotImplementedError( + f"HDF5 object dtype {dtype!r} is not a variable-length string and " + f"is not yet supported by HDFParser. h5py exposes vlen arrays " + f"(`h5py.vlen_dtype`) and object/region references " + f"(`h5py.ref_dtype`, `h5py.regionref_dtype`) as numpy object dtype; " + f"please open an issue if your file needs one of these." + ) # Temporarily disable use CF->Codecs - TODO re-enable in subsequent PR. # cfcodec = cfcodec_from_dataset(dataset) From a5138ab8b67d75365ba2e0f471a015aa4ca19cf7 Mon Sep 17 00:00:00 2001 From: Max Jones <14077947+maxrjones@users.noreply.github.com> Date: Mon, 18 May 2026 15:44:41 -0400 Subject: [PATCH 4/5] Update changelog entry --- docs/releases.md | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/docs/releases.md b/docs/releases.md index 2fb0be54..e6d6cf21 100644 --- a/docs/releases.md +++ b/docs/releases.md @@ -9,7 +9,9 @@ ### Bug fixes -- HDFParser now includes a `_get_fill_value` function which wraps `dataset.fillvalue` in a try/except AttributeError. +- HDFParser now correctly parses datasets with either no fill value or a string dtype fill value. + ([#988](https://github.com/zarr-developers/VirtualiZarr/pull/938)). + By [Sean Harkins](https://github.com/sharkinsspatial) and [Aimee Barciauskas](https://github.com/abarciauskas-bgse). ### Documentation From 2e91a26b022da25498b69841eba88e30d8190138 Mon Sep 17 00:00:00 2001 From: Sean Harkins Date: Tue, 19 May 2026 11:29:16 -0400 Subject: [PATCH 5/5] Apply suggestion from @maxrjones Co-authored-by: Max Jones <14077947+maxrjones@users.noreply.github.com> --- virtualizarr/codecs.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/virtualizarr/codecs.py b/virtualizarr/codecs.py index 093f6ee8..2af61fb5 100644 --- a/virtualizarr/codecs.py +++ b/virtualizarr/codecs.py @@ -106,7 +106,7 @@ def convert_to_codec_pipeline( arrayarray_codecs, arraybytes_codec, bytesbytes_codecs = extract_codecs(zarr_codecs) if arraybytes_codec is None: - if isinstance(dtype, np.dtypes.StringDType) or dtype.kind == "O": + if isinstance(dtype, np.dtypes.StringDType): arraybytes_codec = VLenUTF8Codec() elif dtype.byteorder == ">": arraybytes_codec = BytesCodec(endian="big")