zarr-developers · sharkinsspatial · May 4, 2026 · May 18, 2026 · May 18, 2026 · May 18, 2026
diff --git a/docs/releases.md b/docs/releases.md
@@ -18,6 +18,9 @@ Adds an `IcechunkParser` for reading existing icechunk repositories as virtual d
 
 ### Bug fixes
 
+- `HDFParser` now correctly parses datasets with either no fill value or a string dtype fill value.
+  ([#988](https://github.com/zarr-developers/VirtualiZarr/pull/988)).
+  By [Sean Harkins](https://github.com/sharkinsspatial) and [Aimee Barciauskas](https://github.com/abarciauskas-bgse).
 - Fix `vds.vz.to_icechunk()` raising `IcechunkError("invalid zarr key format")` when the manifest contains inlined chunks. The icechunk writer now always emits `c/0/0/0`-form chunk keys regardless of the manifest's stored chunk-key separator. Mainly surfaces with `IcechunkParser` (icechunk inlines small chunks aggressively); existing parsers don't produce inlined chunks and aren't affected.
   ([#991](https://github.com/zarr-developers/VirtualiZarr/pull/991)).
   By [Tom Nicholas](https://github.com/TomNicholas).

diff --git a/virtualizarr/codecs.py b/virtualizarr/codecs.py
@@ -4,7 +4,7 @@
 import zarr
 from zarr.abc.codec import ArrayArrayCodec, ArrayBytesCodec, BytesBytesCodec
 from zarr.abc.codec import Codec as ZarrCodec
-from zarr.codecs import BytesCodec
+from zarr.codecs import BytesCodec, VLenUTF8Codec
 from zarr.core.codec_pipeline import BatchedCodecPipeline
 from zarr.core.metadata.v3 import ArrayV3Metadata
 
@@ -106,7 +106,9 @@ def convert_to_codec_pipeline(
     arrayarray_codecs, arraybytes_codec, bytesbytes_codecs = extract_codecs(zarr_codecs)
 
     if arraybytes_codec is None:
-        if dtype.byteorder == ">":
+        if isinstance(dtype, np.dtypes.StringDType):
+            arraybytes_codec = VLenUTF8Codec()
+        elif dtype.byteorder == ">":
             arraybytes_codec = BytesCodec(endian="big")
         else:
             arraybytes_codec = BytesCodec()

diff --git a/virtualizarr/parsers/hdf/hdf.py b/virtualizarr/parsers/hdf/hdf.py
@@ -35,6 +35,25 @@
     from h5py import Group as H5Group
 
 
+def _get_fill_value(dataset: H5Dataset):
+    """
+    Return the dataset's fill value as a plain Python object (bytes decoded to
+    str, numpy scalars unwrapped), using numpy's default if h5py cannot read it.
+    """
+    try:
+        raw = dataset.fillvalue
+    except RuntimeError:
+        # Normalise the fallback below as well: for "S" dtypes numpy's default is
+        # bytes, which would otherwise bypass the decoding applied to stored values.
+        raw = np.ma.default_fill_value(dataset.dtype)
+    if isinstance(raw, bytes):
+        # np.bytes_ subclasses bytes, so fixed-length string fills land here too.
+        return raw.decode("utf-8", errors="replace")
+    if isinstance(raw, np.generic):
+        return raw.item()
+    return raw
+
+
 def _construct_manifest_array(
     filepath: str,
     dataset: H5Dataset,
@@ -64,6 +83,22 @@ def _construct_manifest_array(
     attrs = _extract_attrs(dataset)
     dtype = dataset.dtype
 
+    # HDF5 variable-length strings use numpy object dtype, which zarr v3 cannot
+    # resolve automatically. Map to StringDType which zarr maps to VariableLengthUTF8.
+    # Discriminate against other object-kind HDF5 dtypes (vlen arrays, object/
+    # region references) that would silently be coerced to StringDType and
+    # produce garbage downstream — those aren't supported yet, so fail loudly.
+    if h5py.check_string_dtype(dtype) is not None:
+        dtype = np.dtypes.StringDType()
+    elif dtype.kind == "O":
+        raise NotImplementedError(
+            f"HDF5 object dtype {dtype!r} is not a variable-length string and "
+            f"is not yet supported by HDFParser. h5py exposes vlen arrays "
+            f"(`h5py.vlen_dtype`) and object/region references "
+            f"(`h5py.ref_dtype`, `h5py.regionref_dtype`) as numpy object dtype; "
+            f"please open an issue if your file needs one of these."
+        )
+
     # Temporarily disable use CF->Codecs - TODO re-enable in subsequent PR.
     # cfcodec = cfcodec_from_dataset(dataset)
     # if cfcodec:
@@ -74,13 +109,13 @@ def _construct_manifest_array(
     # else:
     # dtype = dataset.dtype
 
-    if "_FillValue" in attrs:
+    if "_FillValue" in attrs and dtype.kind not in ("S", "U", "O", "T"):
         encoded_cf_fill_value = encode_cf_fill_value(attrs["_FillValue"], dtype)
         attrs["_FillValue"] = encoded_cf_fill_value
 
     codec_configs = [zarr_codec_config_to_v3(codec.get_config()) for codec in codecs]
 
-    fill_value = dataset.fillvalue.item()
+    fill_value = _get_fill_value(dataset)
     dims = tuple(_dataset_dims(dataset, group=group))
     metadata = create_v3_array_metadata(
         shape=dataset.shape,

diff --git a/virtualizarr/tests/test_parsers/conftest.py b/virtualizarr/tests/test_parsers/conftest.py
@@ -459,6 +459,38 @@ def chunked_roundtrip_hdf5_s3_file(minio_bucket, cf_array_fill_value_hdf5_file):
     return f"s3://{minio_bucket['bucket']}/{filepath}"
 
 
+@pytest.fixture(
+    params=[
+        np.array([b"hello", b"world"], dtype="S10"),
+        np.array(["hello", "world"], dtype=h5py.string_dtype()),
+    ],
+    ids=["fixed-length-bytes", "variable-length-string"],
+)
+def string_dtype_hdf5_url(tmp_path: Path, request) -> str:
+    """
+    Write the parametrized string array to dataset ``data`` and return the file URL.
+    """
+    filepath = str(tmp_path / "string_dtype.nc")
+
+    with h5py.File(filepath, "w") as f:
+        f.create_dataset(name="data", data=request.param)
+
+    return f"file://{filepath}"
+
+
+@pytest.fixture
+def string_dtype_with_fillvalue_hdf5_url(tmp_path: Path) -> str:
+    """Return the file:// URL of an HDF5 file whose string dataset ``data``
+    carries an empty-string ``_FillValue`` attribute."""
+    target = str(tmp_path / "string_dtype_fillvalue.nc")
+    strings = np.array(["hello", "world"], dtype=h5py.string_dtype())
+
+    with h5py.File(target, "w") as h5file:
+        h5file.create_dataset(name="data", data=strings).attrs["_FillValue"] = ""
+
+    return f"file://{target}"
+
+
 @pytest.fixture()
 def big_endian_dtype_hdf5_file(tmpdir):
     filepath = f"{tmpdir}/big_endian.nc"

diff --git a/virtualizarr/tests/test_parsers/test_hdf/test_hdf.py b/virtualizarr/tests/test_parsers/test_hdf/test_hdf.py
@@ -100,6 +100,19 @@ def test_cf_fill_value(self, cf_fill_value_hdf5_file):
         metadata = manifest_store._group.arrays["data"].metadata
         assert "_FillValue" in metadata.attributes
 
+    def test_string_dtype_fill_value(self, string_dtype_hdf5_url):
+        """The fill value parsed for a string dataset is a str or bytes value."""
+        array = manifest_store_from_hdf_url(string_dtype_hdf5_url)._group.arrays["data"]
+        assert isinstance(array.metadata.fill_value, (str, bytes, np.bytes_))
+
+    def test_string_dtype_cf_fill_value(self, string_dtype_with_fillvalue_hdf5_url):
+        """A string ``_FillValue`` attribute is present on the array as a ``str``."""
+        url = string_dtype_with_fillvalue_hdf5_url
+        array = manifest_store_from_hdf_url(url)._group.arrays["data"]
+        attributes = array.metadata.attributes
+        assert "_FillValue" in attributes
+        assert isinstance(attributes["_FillValue"], str)
+
     def test_cf_array_fill_value(self, cf_array_fill_value_hdf5_file):
         cf_array_fill_value_hdf5_url = f"file://{cf_array_fill_value_hdf5_file}"
         manifest_store = manifest_store_from_hdf_url(cf_array_fill_value_hdf5_url)
@@ -228,3 +241,20 @@ def test_netcdf_over_https():
     ):
         np.testing.assert_allclose(ds["z"].min().to_numpy(), -6)
         np.testing.assert_allclose(ds["z"].max().to_numpy(), 817)
+
+
+def test_fillvalue_runtime_error():
+    """_get_fill_value falls back to numpy's masked default if fillvalue raises."""
+    from virtualizarr.parsers.hdf.hdf import _get_fill_value
+
+    class _RuntimeErrorDataset:
+        # Minimal stand-in: a dtype plus a fillvalue property that always raises.
+        dtype = np.dtype("float32")
+
+        @property
+        def fillvalue(self):
+            raise RuntimeError("Unable to get fill value")
+
+    stub = _RuntimeErrorDataset()
+    expected = np.ma.default_fill_value(stub.dtype)
+    assert _get_fill_value(stub) == expected