zarr-developers · TomNicholas · Jun 5, 2026 · Jun 10, 2026
diff --git a/docs/releases.md b/docs/releases.md
@@ -9,6 +9,11 @@
 - `open_virtual_dataset` and `open_virtual_datatree` now populate `ds.encoding["source"]` with the normalized source URI, mirroring [`xarray.open_dataset`][]'s behaviour. Parsers that have already set `encoding["source"]` are left untouched.
   By [Tom Nicholas](https://github.com/TomNicholas).
 
+### Bug fixes
+
+- The HDF parser now expresses CF `scale_factor`/`add_offset` packing as the zarr `scale_offset` + `cast_value` codecs (stripping those attributes) instead of leaving them as attributes. Files packed with different parameters therefore get different codec pipelines, so `check_same_codecs` refuses to concatenate them rather than silently keeping only the first file's parameters and corrupting decoded values for chunks sourced from other files. No concatenation code was changed. The kerchunk (zarr v2) writer reconstructs the equivalent `scale_factor`/`add_offset` attributes on write. Note that codec-based decoding (`packed / scale`) agrees with xarray's `decode_cf` (`packed * scale_factor`) only to within ~1 ULP. Closes [#1004](https://github.com/zarr-developers/VirtualiZarr/issues/1004).
+  By [Tom Nicholas](https://github.com/TomNicholas).
+
 ### Documentation
 
 - Document that virtual concatenation also requires homogeneous CF encoding (`scale_factor`/`add_offset`) across files — xarray's default attribute-merging silently drops mismatched values and produces incorrectly-decoded data on read. Added a new FAQ bullet and a warning admonition under "Combining virtual datasets" in the usage docs. See [#1004](https://github.com/zarr-developers/VirtualiZarr/issues/1004).

diff --git a/pyproject.toml b/pyproject.toml
@@ -48,6 +48,9 @@ hdf = [
     "hdf5plugin",
     "imagecodecs",
     "imagecodecs-numcodecs==2024.6.1",
+    # CF scale/offset packing is expressed via the zarr `scale_offset` +
+    # `cast_value` codecs; the latter needs its Rust backend at parse time.
+    "cast-value-rs",
 ]
 
 zarr = ["arro3-core"]

diff --git a/virtualizarr/parsers/hdf/filters.py b/virtualizarr/parsers/hdf/filters.py
@@ -8,6 +8,7 @@
 from numcodecs.abc import Codec
 from numcodecs.fixedscaleoffset import FixedScaleOffset
 from xarray.coding.variables import _choose_float_dtype
+from zarr.codecs import CastValue, ScaleOffset
 
 from virtualizarr.utils import soft_import
 
@@ -68,6 +69,13 @@ class CFCodec(TypedDict):
     codec: Codec
 
 
+class CFCodecs(TypedDict):
+    target_dtype: np.dtype
+    codecs: list[dict]
+    scale: float
+    offset: float
+
+
 def _filter_to_codec(
     filter_id: str, filter_properties: Union[int, None, Tuple] = None
 ) -> Codec:
@@ -170,6 +178,69 @@ def cfcodec_from_dataset(dataset: Dataset) -> Codec | None:
         return None
 
 
+def cf_codecs_from_dataset(dataset: Dataset) -> CFCodecs | None:
+    """
+    Express an h5py dataset's CF scale/offset packing as zarr v3 codecs.
+
+    CF-packed data is stored as integers and decoded as
+    ``unpacked = packed * scale_factor + add_offset``. We represent that with
+    two zarr v3 codecs applied to the (decoded) float array:
+
+    - ``scale_offset`` does the arithmetic. Its decode is ``x / scale + offset``,
+      so ``scale = 1 / scale_factor`` and ``offset = add_offset``.
+    - ``cast_value`` casts between the decoded float dtype and the stored
+      integer dtype.
+
+    Returning these as codecs (rather than leaving ``scale_factor`` /
+    ``add_offset`` as attributes) means two arrays packed with different
+    parameters get different codec pipelines, so ``check_same_codecs`` refuses
+    to concatenate them instead of silently corrupting decoded values
+    (see https://github.com/zarr-developers/VirtualiZarr/issues/1004).
+
+    Parameters
+    ----------
+    dataset
+       An h5py dataset.
+
+    Returns
+    -------
+    CFCodecs or None
+        ``None`` when the dataset has no scale/offset packing.
+    """
+    attributes = {attr: dataset.attrs[attr] for attr in dataset.attrs}
+    mapping = {}
+    if "scale_factor" in attributes:
+        try:
+            scale_factor = attributes["scale_factor"][0]
+        except IndexError:
+            scale_factor = attributes["scale_factor"]
+        mapping["scale_factor"] = float(1 / scale_factor)
+    else:
+        mapping["scale_factor"] = 1
+    if "add_offset" in attributes:
+        try:
+            offset = attributes["add_offset"][0]
+        except IndexError:
+            offset = attributes["add_offset"]
+        mapping["add_offset"] = float(offset)
+    else:
+        mapping["add_offset"] = 0
+    if mapping["scale_factor"] == 1 and mapping["add_offset"] == 0:
+        return None
+
+    target_dtype = np.dtype(_choose_float_dtype(dtype=dataset.dtype, mapping=mapping))
+    scale_offset = ScaleOffset(
+        scale=mapping["scale_factor"], offset=mapping["add_offset"]
+    )
+    cast_value = CastValue(data_type=dataset.dtype.name)
+    return CFCodecs(
+        target_dtype=target_dtype,
+        codecs=[scale_offset.to_dict(), cast_value.to_dict()],
+        scale=mapping["scale_factor"],
+        offset=mapping["add_offset"],
+    )
+
+
 def codecs_from_dataset(dataset: Dataset) -> List[Codec]:
     """
     Extracts a list of numcodecs from an h5py dataset

diff --git a/virtualizarr/parsers/hdf/hdf.py b/virtualizarr/parsers/hdf/hdf.py
@@ -21,7 +21,7 @@
     ManifestStore,
 )
 from virtualizarr.manifests.utils import create_v3_array_metadata
-from virtualizarr.parsers.hdf.filters import codecs_from_dataset
+from virtualizarr.parsers.hdf.filters import cf_codecs_from_dataset, codecs_from_dataset
 from virtualizarr.parsers.typing import ReaderFactory
 from virtualizarr.parsers.utils import encode_cf_fill_value
 from virtualizarr.types import ChunkKey
@@ -64,23 +64,28 @@ def _construct_manifest_array(
     attrs = _extract_attrs(dataset)
     dtype = dataset.dtype
 
-    # Temporarily disable use CF->Codecs - TODO re-enable in subsequent PR.
-    # cfcodec = cfcodec_from_dataset(dataset)
-    # if cfcodec:
-    # codecs.insert(0, cfcodec["codec"])
-    # dtype = cfcodec["target_dtype"]
-    # attrs.pop("scale_factor", None)
-    # attrs.pop("add_offset", None)
-    # else:
-    # dtype = dataset.dtype
+    codec_configs = [zarr_codec_config_to_v3(codec.get_config()) for codec in codecs]
+    fill_value = dataset.fillvalue.item()
+
+    # Express CF scale/offset packing as zarr codecs rather than attributes, so
+    # arrays packed with different parameters get different codec pipelines and
+    # cannot be silently concatenated. The scale_offset/cast_value codecs run on
+    # the decoded float array, so they go before the file's byte-level filters.
+    # See https://github.com/zarr-developers/VirtualiZarr/issues/1004.
+    cfcodecs = cf_codecs_from_dataset(dataset)
+    if cfcodecs is not None:
+        codec_configs = cfcodecs["codecs"] + codec_configs
+        dtype = cfcodecs["target_dtype"]
+        # The array is now logically float (decoded), so the fill value — stored
+        # in the packed integer domain — must be decoded to match, mirroring the
+        # ScaleOffset codec's decode of `packed / scale + offset`.
+        fill_value = fill_value / cfcodecs["scale"] + cfcodecs["offset"]
+        attrs.pop("scale_factor", None)
+        attrs.pop("add_offset", None)
 
     if "_FillValue" in attrs:
         encoded_cf_fill_value = encode_cf_fill_value(attrs["_FillValue"], dtype)
         attrs["_FillValue"] = encoded_cf_fill_value
-
-    codec_configs = [zarr_codec_config_to_v3(codec.get_config()) for codec in codecs]
-
-    fill_value = dataset.fillvalue.item()
     dims = tuple(_dataset_dims(dataset, group=group))
     metadata = create_v3_array_metadata(
         shape=dataset.shape,

diff --git a/virtualizarr/tests/test_parsers/test_hdf/test_hdf.py b/virtualizarr/tests/test_parsers/test_hdf/test_hdf.py
@@ -10,8 +10,10 @@
 from virtualizarr.parsers import HDFParser
 from virtualizarr.tests import (
     requires_hdf5plugin,
+    requires_icechunk,
     requires_imagecodecs,
 )
+from virtualizarr.tests.test_integration import roundtrip_as_in_memory_icechunk
 from virtualizarr.tests.utils import manifest_store_from_hdf_url
 
 
@@ -228,3 +230,111 @@ def test_netcdf_over_https():
     ):
         np.testing.assert_allclose(ds["z"].min().to_numpy(), -6)
         np.testing.assert_allclose(ds["z"].max().to_numpy(), 817)
+
+
+def _write_packed_hdf5(
+    path, *, x_start, x_len, scale_factor, add_offset, fill_value=-9999
+):
+    """Write a small CF-packed int16 HDF5 file with the given scale/offset attrs."""
+    with h5py.File(path, "w") as f:
+        data = np.arange(x_start, x_start + x_len, dtype="int16").reshape(x_len, 1)
+        d = f.create_dataset("foo", data=data, chunks=(x_len, 1))
+        d.attrs["scale_factor"] = np.float64(scale_factor)
+        d.attrs["add_offset"] = np.float64(add_offset)
+        d.attrs["_FillValue"] = np.int16(fill_value)
+        x = f.create_dataset(
+            "x", data=np.arange(x_start, x_start + x_len, dtype="int32")
+        )
+        x.make_scale("x")
+        d.dims[0].attach_scale(x)
+        y = f.create_dataset("y", data=np.arange(1, dtype="int32"))
+        y.make_scale("y")
+        d.dims[1].attach_scale(y)
+
+
+@requires_hdf5plugin
+@requires_imagecodecs
+class TestConcatMismatchedCFEncoding:
+    """
+    The HDF parser expresses CF scale_factor / add_offset packing as zarr
+    `scale_offset` + `cast_value` codecs rather than as attributes. Files packed
+    with different parameters therefore get different codec pipelines, so the
+    existing `check_same_codecs` refuses to concatenate them instead of silently
+    keeping only the first file's parameters and corrupting decoded values for
+    every chunk that did not come from the first file (see
+    https://github.com/zarr-developers/VirtualiZarr/issues/1004).
+    """
+
+    def test_concat_mismatched_scale_factor_raises(self, tmp_path, local_registry):
+        p1 = tmp_path / "a.nc"
+        p2 = tmp_path / "b.nc"
+        _write_packed_hdf5(p1, x_start=0, x_len=4, scale_factor=0.1, add_offset=0.0)
+        _write_packed_hdf5(p2, x_start=4, x_len=4, scale_factor=0.01, add_offset=0.0)
+
+        parser = HDFParser()
+        with (
+            open_virtual_dataset(
+                url=f"file://{p1}", parser=parser, registry=local_registry
+            ) as vds1,
+            open_virtual_dataset(
+                url=f"file://{p2}", parser=parser, registry=local_registry
+            ) as vds2,
+        ):
+            # the codec mismatch surfaces as different ScaleOffset codecs:
+            # scale = 1 / scale_factor, so 0.1 -> 10.0 and 0.01 -> 100.0
+            with pytest.raises(NotImplementedError, match="ScaleOffset") as excinfo:
+                xr.concat([vds1, vds2], dim="x")
+            assert "scale=10.0" in str(excinfo.value)
+            assert "scale=100.0" in str(excinfo.value)
+
+    def test_concat_mismatched_add_offset_raises(self, tmp_path, local_registry):
+        p1 = tmp_path / "a.nc"
+        p2 = tmp_path / "b.nc"
+        _write_packed_hdf5(p1, x_start=0, x_len=4, scale_factor=0.1, add_offset=0.0)
+        _write_packed_hdf5(p2, x_start=4, x_len=4, scale_factor=0.1, add_offset=100.0)
+
+        parser = HDFParser()
+        with (
+            open_virtual_dataset(
+                url=f"file://{p1}", parser=parser, registry=local_registry
+            ) as vds1,
+            open_virtual_dataset(
+                url=f"file://{p2}", parser=parser, registry=local_registry
+            ) as vds2,
+        ):
+            # add_offset maps directly onto the ScaleOffset codec's offset
+            with pytest.raises(NotImplementedError, match="ScaleOffset") as excinfo:
+                xr.concat([vds1, vds2], dim="x")
+            assert "offset=100.0" in str(excinfo.value)
+
+    @requires_icechunk
+    def test_concat_matching_cf_encoding_roundtrips(self, tmp_path, local_registry):
+        # positive control: identical CF encoding must concatenate cleanly, and a
+        # write -> reopen -> decode round-trip must yield the correct unpacked
+        # values for chunks sourced from *both* files (packed = arange,
+        # decoded = packed * 0.1).
+        p1 = tmp_path / "a.nc"
+        p2 = tmp_path / "b.nc"
+        _write_packed_hdf5(p1, x_start=0, x_len=4, scale_factor=0.1, add_offset=0.0)
+        _write_packed_hdf5(p2, x_start=4, x_len=4, scale_factor=0.1, add_offset=0.0)
+
+        parser = HDFParser()
+        with (
+            open_virtual_dataset(
+                url=f"file://{p1}", parser=parser, registry=local_registry
+            ) as vds1,
+            open_virtual_dataset(
+                url=f"file://{p2}", parser=parser, registry=local_registry
+            ) as vds2,
+        ):
+            combined = xr.concat([vds1, vds2], dim="x")
+            assert combined["foo"].shape == (8, 1)
+            # scale/offset live on the codec pipeline, not as attributes
+            assert (
+                "scale_factor" not in combined["foo"].variable.data.metadata.attributes
+            )
+            assert "add_offset" not in combined["foo"].variable.data.metadata.attributes
+
+            roundtrip = roundtrip_as_in_memory_icechunk(combined, tmp_path)
+            expected = (np.arange(0, 8) * 0.1).reshape(8, 1)
+            np.testing.assert_allclose(roundtrip["foo"].values, expected)
diff --git a/virtualizarr/tests/test_xarray.py b/virtualizarr/tests/test_xarray.py
@@ -820,7 +820,15 @@ def test_loadable_variables(
 
             for name, var in ds.variables.items():
                 if name in actual_loadable_variables:
-                    xrt.assert_identical(vds.variables[name], ds.variables[name])
+                    if "scale_factor" in var.encoding or "add_offset" in var.encoding:
+                        # CF scale/offset packing is now decoded by the zarr
+                        # scale_offset codec (packed / scale) rather than by
+                        # xarray's decode_cf (packed * scale_factor). The two
+                        # agree to within ~1 ULP but are not bit-identical, so
+                        # the decoded values are compared with assert_allclose.
+                        xrt.assert_allclose(vds.variables[name], ds.variables[name])
+                    else:
+                        xrt.assert_identical(vds.variables[name], ds.variables[name])
 
     def test_group_kwarg_not_a_group(self, hdf5_groups_file, local_registry):
         parser = HDFParser(group="doesnt_exist")
@@ -846,7 +854,15 @@ def test_group_kwarg(self, hdf5_groups_file, local_registry):
         ):
             for name in full_ds.variables:
                 if name in vars_to_load:
-                    xrt.assert_identical(vds.variables[name], full_ds.variables[name])
+                    ref = full_ds.variables[name]
+                    if "scale_factor" in ref.encoding or "add_offset" in ref.encoding:
+                        # CF scale/offset packing is decoded by the zarr
+                        # scale_offset codec (packed / scale) rather than by
+                        # xarray's decode_cf (packed * scale_factor); these agree
+                        # to within ~1 ULP but are not bit-identical.
+                        xrt.assert_allclose(vds.variables[name], ref)
+                    else:
+                        xrt.assert_identical(vds.variables[name], ref)
 
     def test_open_dataset_with_empty(self, hdf5_empty, local_registry):
         parser = HDFParser()