From 9ed7c07c65725fa68237368919040b7cfac7776a Mon Sep 17 00:00:00 2001
From: sharkinsspatial <sharkinsgis@gmail.com>
Date: Mon, 4 May 2026 15:17:36 -0400
Subject: [PATCH 1/5] Correctly handle HDF5 fillvalue for string dtype arrays.

---
 virtualizarr/codecs.py                        |  6 ++--
 virtualizarr/parsers/hdf/hdf.py               | 25 +++++++++++++--
 virtualizarr/tests/test_parsers/conftest.py   | 32 +++++++++++++++++++
 .../tests/test_parsers/test_hdf/test_hdf.py   | 13 ++++++++
 4 files changed, 72 insertions(+), 4 deletions(-)

diff --git a/virtualizarr/codecs.py b/virtualizarr/codecs.py
index d60f2fc1..093f6ee8 100644
--- a/virtualizarr/codecs.py
+++ b/virtualizarr/codecs.py
@@ -4,7 +4,7 @@
 import zarr
 from zarr.abc.codec import ArrayArrayCodec, ArrayBytesCodec, BytesBytesCodec
 from zarr.abc.codec import Codec as ZarrCodec
-from zarr.codecs import BytesCodec
+from zarr.codecs import BytesCodec, VLenUTF8Codec
 from zarr.core.codec_pipeline import BatchedCodecPipeline
 from zarr.core.metadata.v3 import ArrayV3Metadata
 
@@ -106,7 +106,9 @@ def convert_to_codec_pipeline(
     arrayarray_codecs, arraybytes_codec, bytesbytes_codecs = extract_codecs(zarr_codecs)
 
     if arraybytes_codec is None:
-        if dtype.byteorder == ">":
+        if isinstance(dtype, np.dtypes.StringDType) or dtype.kind == "O":
+            arraybytes_codec = VLenUTF8Codec()
+        elif dtype.byteorder == ">":
             arraybytes_codec = BytesCodec(endian="big")
         else:
             arraybytes_codec = BytesCodec()
diff --git a/virtualizarr/parsers/hdf/hdf.py b/virtualizarr/parsers/hdf/hdf.py
index d9cef4ff..aa2051b4 100644
--- a/virtualizarr/parsers/hdf/hdf.py
+++ b/virtualizarr/parsers/hdf/hdf.py
@@ -35,6 +35,22 @@
     from h5py import Group as H5Group
 
 
+def _get_fill_value(dataset: H5Dataset):
+    """
+    Extract the fill value from an h5py dataset, handling string/bytes dtypes
+    that don't return numpy scalars from dataset.fillvalue.
+    """
+    raw = dataset.fillvalue
+    if isinstance(raw, bytes):
+        return raw.decode("utf-8", errors="replace")
+    elif isinstance(raw, str):
+        return raw
+    elif isinstance(raw, np.generic):
+        return raw.item()
+    else:
+        return raw
+
+
 def _construct_manifest_array(
     filepath: str,
     dataset: H5Dataset,
@@ -64,6 +80,11 @@ def _construct_manifest_array(
     attrs = _extract_attrs(dataset)
     dtype = dataset.dtype
 
+    # HDF5 variable-length strings use numpy object dtype, which zarr v3 cannot
+    # resolve automatically. Map to StringDType which zarr maps to VariableLengthUTF8.
+    if dtype.kind == "O":
+        dtype = np.dtypes.StringDType()
+
     # Temporarily disable use CF->Codecs - TODO re-enable in subsequent PR.
     # cfcodec = cfcodec_from_dataset(dataset)
     # if cfcodec:
@@ -74,13 +95,13 @@ def _construct_manifest_array(
     # else:
     # dtype = dataset.dtype
 
-    if "_FillValue" in attrs:
+    if "_FillValue" in attrs and dtype.kind not in ("S", "U", "O", "T"):
         encoded_cf_fill_value = encode_cf_fill_value(attrs["_FillValue"], dtype)
         attrs["_FillValue"] = encoded_cf_fill_value
 
     codec_configs = [zarr_codec_config_to_v3(codec.get_config()) for codec in codecs]
 
-    fill_value = dataset.fillvalue.item()
+    fill_value = _get_fill_value(dataset)
     dims = tuple(_dataset_dims(dataset, group=group))
     metadata = create_v3_array_metadata(
         shape=dataset.shape,
diff --git a/virtualizarr/tests/test_parsers/conftest.py b/virtualizarr/tests/test_parsers/conftest.py
index 23d30a4f..6cac0261 100644
--- a/virtualizarr/tests/test_parsers/conftest.py
+++ b/virtualizarr/tests/test_parsers/conftest.py
@@ -469,6 +469,38 @@ def chunked_roundtrip_hdf5_s3_file(minio_bucket, cf_array_fill_value_hdf5_file):
     return f"s3://{minio_bucket['bucket']}/{filepath}"
 
 
+@pytest.fixture(
+    params=[
+        {"dtype": "S10", "data": np.array([b"hello", b"world"], dtype="S10")},
+        {
+            "dtype": None,
+            "data": np.array(["hello", "world"], dtype=h5py.string_dtype()),
+        },
+    ],
+    ids=["fixed-length-bytes", "variable-length-string"],
+)
+def string_dtype_hdf5_url(tmp_path: Path, request) -> str:
+    filepath = str(tmp_path / "string_dtype.nc")
+
+    with h5py.File(filepath, "w") as f:
+        f.create_dataset(name="data", data=request.param["data"])
+
+    return f"file://{filepath}"
+
+
+@pytest.fixture
+def string_dtype_with_fillvalue_hdf5_url(tmp_path: Path) -> str:
+    filepath = str(tmp_path / "string_dtype_fillvalue.nc")
+
+    with h5py.File(filepath, "w") as f:
+        dset = f.create_dataset(
+            name="data", data=np.array(["hello", "world"], dtype=h5py.string_dtype())
+        )
+        dset.attrs["_FillValue"] = ""
+
+    return f"file://{filepath}"
+
+
 @pytest.fixture()
 def big_endian_dtype_hdf5_file(tmpdir):
     filepath = f"{tmpdir}/big_endian.nc"
diff --git a/virtualizarr/tests/test_parsers/test_hdf/test_hdf.py b/virtualizarr/tests/test_parsers/test_hdf/test_hdf.py
index 9854174f..9d59224d 100644
--- a/virtualizarr/tests/test_parsers/test_hdf/test_hdf.py
+++ b/virtualizarr/tests/test_parsers/test_hdf/test_hdf.py
@@ -100,6 +100,19 @@ def test_cf_fill_value(self, cf_fill_value_hdf5_file):
         metadata = manifest_store._group.arrays["data"].metadata
         assert "_FillValue" in metadata.attributes
 
+    def test_string_dtype_fill_value(self, string_dtype_hdf5_url):
+        manifest_store = manifest_store_from_hdf_url(string_dtype_hdf5_url)
+        metadata = manifest_store._group.arrays["data"].metadata
+        assert isinstance(metadata.fill_value, (str, bytes, np.bytes_))
+
+    def test_string_dtype_cf_fill_value(self, string_dtype_with_fillvalue_hdf5_url):
+        manifest_store = manifest_store_from_hdf_url(
+            string_dtype_with_fillvalue_hdf5_url
+        )
+        metadata = manifest_store._group.arrays["data"].metadata
+        assert "_FillValue" in metadata.attributes
+        assert isinstance(metadata.attributes["_FillValue"], str)
+
     def test_cf_array_fill_value(self, cf_array_fill_value_hdf5_file):
         cf_array_fill_value_hdf5_url = f"file://{cf_array_fill_value_hdf5_file}"
         manifest_store = manifest_store_from_hdf_url(cf_array_fill_value_hdf5_url)

From ecc6c026222bf261feb7e5820be99d0e91443df6 Mon Sep 17 00:00:00 2001
From: Aimee Barciauskas <aimee@developmentseed.org>
Date: Mon, 18 May 2026 12:40:48 -0700
Subject: [PATCH 2/5] fix: handle H5Dataset missing fillvalue attribute with
 dtype-based fallback (#993)

* fix: handle H5Dataset missing fillvalue attribute with dtype-based fallback

Some h5py Dataset objects do not expose a fillvalue attribute. Guard against
AttributeError in _get_fill_value and fall back to np.ma.default_fill_value
for the dataset's dtype. Adds a unit test covering this path.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>

* Add release note

* fix: catch RuntimeError (not AttributeError) from h5py fillvalue

h5py.Dataset.fillvalue delegates to the HDF5 C library via Cython and
raises RuntimeError for unsupported dtypes (e.g. variable-length strings,
compound types). AttributeError can never fire on a real h5py.Dataset
since fillvalue is always a defined property.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>

---------

Co-authored-by: Claude Sonnet 4.6 <noreply@anthropic.com>
---
 docs/releases.md                                |  6 ++++++
 virtualizarr/parsers/hdf/hdf.py                 |  5 ++++-
 .../tests/test_parsers/test_hdf/test_hdf.py     | 17 +++++++++++++++++
 3 files changed, 27 insertions(+), 1 deletion(-)

diff --git a/docs/releases.md b/docs/releases.md
index 0f1ce158..022a76aa 100644
--- a/docs/releases.md
+++ b/docs/releases.md
@@ -1,5 +1,11 @@
 # Release notes
 
+## Unreleased
+
+### Bug Fixes
+
+HDFParser now includes a `_get_fill_value` function which wraps `dataset.fillvalue` in a try/except AttributeError.
+
 ## v2.6.1 (3rd May 2026)
 
 Adds end-to-end support for inlined chunk references in `ChunkManifest` (read via Kerchunk parsers, write via Kerchunk and Icechunk writers), plus Zarr-Python 3.2.0 compatibility and several bug fixes.
diff --git a/virtualizarr/parsers/hdf/hdf.py b/virtualizarr/parsers/hdf/hdf.py
index aa2051b4..d21c5f7e 100644
--- a/virtualizarr/parsers/hdf/hdf.py
+++ b/virtualizarr/parsers/hdf/hdf.py
@@ -40,7 +40,10 @@ def _get_fill_value(dataset: H5Dataset):
     Extract the fill value from an h5py dataset, handling string/bytes dtypes
     that don't return numpy scalars from dataset.fillvalue.
     """
-    raw = dataset.fillvalue
+    try:
+        raw = dataset.fillvalue
+    except RuntimeError:
+        return np.ma.default_fill_value(dataset.dtype)
     if isinstance(raw, bytes):
         return raw.decode("utf-8", errors="replace")
     elif isinstance(raw, str):
diff --git a/virtualizarr/tests/test_parsers/test_hdf/test_hdf.py b/virtualizarr/tests/test_parsers/test_hdf/test_hdf.py
index 9d59224d..fac802a6 100644
--- a/virtualizarr/tests/test_parsers/test_hdf/test_hdf.py
+++ b/virtualizarr/tests/test_parsers/test_hdf/test_hdf.py
@@ -240,3 +240,20 @@ def test_netcdf_over_https():
     ):
         np.testing.assert_allclose(ds["z"].min().to_numpy(), -6)
         np.testing.assert_allclose(ds["z"].max().to_numpy(), 817)
+
+
+def test_fillvalue_runtime_error():
+    from virtualizarr.parsers.hdf.hdf import _get_fill_value
+
+    dtype = np.dtype("float32")
+
+    class _RuntimeErrorDataset:
+        @property
+        def fillvalue(self):
+            raise RuntimeError("Unable to get fill value")
+
+    dataset = _RuntimeErrorDataset()
+    dataset.dtype = dtype  # type: ignore[attr-defined]
+
+    result = _get_fill_value(dataset)
+    assert result == np.ma.default_fill_value(dtype)

From 7d41afb43c7f8a4ab231a7282e9cfc3dec192c41 Mon Sep 17 00:00:00 2001
From: Max Jones <14077947+maxrjones@users.noreply.github.com>
Date: Mon, 18 May 2026 15:41:54 -0400
Subject: [PATCH 3/5] Update virtualizarr/parsers/hdf/hdf.py

---
 virtualizarr/parsers/hdf/hdf.py | 13 ++++++++++++-
 1 file changed, 12 insertions(+), 1 deletion(-)

diff --git a/virtualizarr/parsers/hdf/hdf.py b/virtualizarr/parsers/hdf/hdf.py
index d21c5f7e..1de4ee72 100644
--- a/virtualizarr/parsers/hdf/hdf.py
+++ b/virtualizarr/parsers/hdf/hdf.py
@@ -85,8 +85,19 @@ def _construct_manifest_array(
 
     # HDF5 variable-length strings use numpy object dtype, which zarr v3 cannot
     # resolve automatically. Map to StringDType which zarr maps to VariableLengthUTF8.
-    if dtype.kind == "O":
+    # Discriminate against other object-kind HDF5 dtypes (vlen arrays, object/
+    # region references) that would silently be coerced to StringDType and
+    # produce garbage downstream — those aren't supported yet, so fail loudly.
+    if h5py.check_string_dtype(dtype) is not None:
         dtype = np.dtypes.StringDType()
+    elif dtype.kind == "O":
+        raise NotImplementedError(
+            f"HDF5 object dtype {dtype!r} is not a variable-length string and "
+            f"is not yet supported by HDFParser. h5py exposes vlen arrays "
+            f"(`h5py.vlen_dtype`) and object/region references "
+            f"(`h5py.ref_dtype`, `h5py.regionref_dtype`) as numpy object dtype; "
+            f"please open an issue if your file needs one of these."
+        )
 
     # Temporarily disable use CF->Codecs - TODO re-enable in subsequent PR.
     # cfcodec = cfcodec_from_dataset(dataset)

From a5138ab8b67d75365ba2e0f471a015aa4ca19cf7 Mon Sep 17 00:00:00 2001
From: Max Jones <14077947+maxrjones@users.noreply.github.com>
Date: Mon, 18 May 2026 15:44:41 -0400
Subject: [PATCH 4/5] Update changelog entry

---
 docs/releases.md | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/docs/releases.md b/docs/releases.md
index 2fb0be54..e6d6cf21 100644
--- a/docs/releases.md
+++ b/docs/releases.md
@@ -9,7 +9,9 @@
 
 ### Bug fixes
 
-- HDFParser now includes a `_get_fill_value` function which wraps `dataset.fillvalue` in a try/except AttributeError.
+- HDFParser now correctly parses datasets with either no fill value or a string dtype fill value.
+  ([#988](https://github.com/zarr-developers/VirtualiZarr/pull/938)).
+  By [Sean Harkins](https://github.com/sharkinsspatial) and [Aimee Barciauskas](https://github.com/abarciauskas-bgse).
 
 ### Documentation
 

From 2e91a26b022da25498b69841eba88e30d8190138 Mon Sep 17 00:00:00 2001
From: Sean Harkins <sharkinsgis@gmail.com>
Date: Tue, 19 May 2026 11:29:16 -0400
Subject: [PATCH 5/5] Apply suggestion from @maxrjones

Co-authored-by: Max Jones <14077947+maxrjones@users.noreply.github.com>
---
 virtualizarr/codecs.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/virtualizarr/codecs.py b/virtualizarr/codecs.py
index 093f6ee8..2af61fb5 100644
--- a/virtualizarr/codecs.py
+++ b/virtualizarr/codecs.py
@@ -106,7 +106,7 @@ def convert_to_codec_pipeline(
     arrayarray_codecs, arraybytes_codec, bytesbytes_codecs = extract_codecs(zarr_codecs)
 
     if arraybytes_codec is None:
-        if isinstance(dtype, np.dtypes.StringDType) or dtype.kind == "O":
+        if isinstance(dtype, np.dtypes.StringDType):
             arraybytes_codec = VLenUTF8Codec()
         elif dtype.byteorder == ">":
             arraybytes_codec = BytesCodec(endian="big")