From bccb2cb6a6f1d769d62ac9b4c4bce4ac308a435b Mon Sep 17 00:00:00 2001 From: TomNicholas Date: Tue, 26 May 2026 14:52:29 -0400 Subject: [PATCH 1/5] test(manifests): add failing tests for CF encoding mismatch on concat Two virtual datasets whose source files were CF-packed with different scale_factor / add_offset attributes are currently silently concatenable - np.concatenate keeps only the first array's attrs, corrupting decoded values for every chunk that came from a later file. Encode the expected behaviour (raise ValueError) so the fix can target it. --- .../tests/test_parsers/test_hdf/test_hdf.py | 88 +++++++++++++++++++ 1 file changed, 88 insertions(+) diff --git a/virtualizarr/tests/test_parsers/test_hdf/test_hdf.py b/virtualizarr/tests/test_parsers/test_hdf/test_hdf.py index 9c7442c4..00270f7c 100644 --- a/virtualizarr/tests/test_parsers/test_hdf/test_hdf.py +++ b/virtualizarr/tests/test_parsers/test_hdf/test_hdf.py @@ -228,3 +228,91 @@ def test_netcdf_over_https(): ): np.testing.assert_allclose(ds["z"].min().to_numpy(), -6) np.testing.assert_allclose(ds["z"].max().to_numpy(), 817) + + +def _write_packed_hdf5( + path, *, x_start, x_len, scale_factor, add_offset, fill_value=-9999 +): + """Write a small CF-packed int16 HDF5 file with the given scale/offset attrs.""" + with h5py.File(path, "w") as f: + data = np.arange(x_start, x_start + x_len, dtype="int16").reshape(x_len, 1) + d = f.create_dataset("foo", data=data, chunks=(x_len, 1)) + d.attrs["scale_factor"] = np.float64(scale_factor) + d.attrs["add_offset"] = np.float64(add_offset) + d.attrs["_FillValue"] = np.int16(fill_value) + x = f.create_dataset( + "x", data=np.arange(x_start, x_start + x_len, dtype="int32") + ) + x.make_scale("x") + d.dims[0].attach_scale(x) + y = f.create_dataset("y", data=np.arange(1, dtype="int32")) + y.make_scale("y") + d.dims[1].attach_scale(y) + + +@requires_hdf5plugin +@requires_imagecodecs +class TestConcatMismatchedCFEncoding: + """ + Concatenating virtual datasets whose source files were CF-packed with + different scale_factor / add_offset / _FillValue must not silently keep + only the first file's attrs — doing so corrupts decoded values for every + chunk that did not come from the first file. + """ + + def test_concat_mismatched_scale_factor_raises(self, tmp_path, local_registry): + p1 = tmp_path / "a.nc" + p2 = tmp_path / "b.nc" + _write_packed_hdf5(p1, x_start=0, x_len=4, scale_factor=0.1, add_offset=0.0) + _write_packed_hdf5(p2, x_start=4, x_len=4, scale_factor=0.01, add_offset=0.0) + + parser = HDFParser() + with ( + open_virtual_dataset( + url=f"file://{p1}", parser=parser, registry=local_registry + ) as vds1, + open_virtual_dataset( + url=f"file://{p2}", parser=parser, registry=local_registry + ) as vds2, + ): + with pytest.raises(ValueError, match="scale_factor"): + xr.concat([vds1, vds2], dim="x") + + def test_concat_mismatched_add_offset_raises(self, tmp_path, local_registry): + p1 = tmp_path / "a.nc" + p2 = tmp_path / "b.nc" + _write_packed_hdf5(p1, x_start=0, x_len=4, scale_factor=0.1, add_offset=0.0) + _write_packed_hdf5(p2, x_start=4, x_len=4, scale_factor=0.1, add_offset=100.0) + + parser = HDFParser() + with ( + open_virtual_dataset( + url=f"file://{p1}", parser=parser, registry=local_registry + ) as vds1, + open_virtual_dataset( + url=f"file://{p2}", parser=parser, registry=local_registry + ) as vds2, + ): + with pytest.raises(ValueError, match="add_offset"): + xr.concat([vds1, vds2], dim="x") + + def test_concat_matching_cf_encoding_succeeds(self, tmp_path, local_registry): + # positive control: identical CF encoding must still concatenate cleanly + p1 = tmp_path / "a.nc" + p2 = tmp_path / "b.nc" + _write_packed_hdf5(p1, x_start=0, x_len=4, scale_factor=0.1, add_offset=0.0) + _write_packed_hdf5(p2, x_start=4, x_len=4, scale_factor=0.1, add_offset=0.0) + + parser = HDFParser() + with ( + open_virtual_dataset( + url=f"file://{p1}", parser=parser, registry=local_registry + ) as vds1, + open_virtual_dataset( + url=f"file://{p2}", parser=parser, registry=local_registry + ) as vds2, + ): + combined = xr.concat([vds1, vds2], dim="x") + assert combined["foo"].shape == (8, 1) + assert combined["foo"].attrs["scale_factor"] == 0.1 + assert combined["foo"].attrs["add_offset"] == 0.0 From 13a0b97dca596e8bdd9d49c1c9c17d37cd2f0ac5 Mon Sep 17 00:00:00 2001 From: TomNicholas Date: Tue, 26 May 2026 15:39:06 -0400 Subject: [PATCH 2/5] fix(manifests): detect mismatched CF encoding attrs on concat Two virtual datasets whose source files were CF-packed with different scale_factor / add_offset / _FillValue / missing_value silently concatenated, with np.concatenate keeping only the first array's attrs. After the next write + xr.open_zarr + decode_cf, the surviving attrs were applied to every chunk - including chunks that were packed with a different value, silently corrupting decoded values for everything sourced from non-first files. Move CF decoding attributes from the wrapping xr.Variable onto the inner ManifestArray.metadata.attributes (where np.concatenate dispatch can inspect them), and have check_combinable_zarr_arrays raise on any mismatch across inputs. Arbitrary metadata still lives on xr.Variable.attrs. Writers (icechunk, kerchunk) gain a small helper extract_cf_encoding_attrs(var) so they can pull the CF attrs back out when materializing the destination store's metadata, keeping the round-trip identical. --- virtualizarr/manifests/array.py | 38 +++++++++++---- virtualizarr/manifests/utils.py | 46 +++++++++++++++++++ virtualizarr/tests/test_parsers/test_dmrpp.py | 8 +++- .../tests/test_parsers/test_hdf/test_hdf.py | 9 ++-- virtualizarr/writers/icechunk.py | 7 ++- virtualizarr/writers/kerchunk.py | 3 +- virtualizarr/xarray.py | 27 +++++++++++ 7 files changed, 123 insertions(+), 15 deletions(-) diff --git a/virtualizarr/manifests/array.py b/virtualizarr/manifests/array.py index fdbe3592..2272bc14 100644 --- a/virtualizarr/manifests/array.py +++ b/virtualizarr/manifests/array.py @@ -338,10 +338,19 @@ def to_virtual_variable(self) -> xr.Variable: Create a "virtual" xarray.Variable containing the contents of one zarr array. The returned variable will be "virtual", i.e. it will wrap a single ManifestArray object. + + Attributes are split by role: + + - CF decoding attributes (`scale_factor`, `add_offset`, `_FillValue`, + `missing_value`) stay on the inner ManifestArray. This is the + canonical home: a value mismatch across concat inputs would + silently corrupt decoded chunks, so it must be detectable at the + ManifestArray layer where np.concatenate dispatches. + - All other attributes are treated as arbitrary metadata and move + onto the wrapping `xr.Variable`, matching xarray's data model + for attrs. """ - # The xarray data model stores dimension names and arbitrary extra metadata outside of the wrapped array class, - # so to avoid that information being duplicated we strip it from the ManifestArray before wrapping it. if self.metadata.dimension_names is not None: dims = self.metadata.dimension_names elif self.ndim == 0: @@ -350,16 +359,27 @@ def to_virtual_variable(self) -> xr.Variable: raise ValueError( f"Cannot create virtual variable from {self.ndim}-dimensional array without dimension names." ) - attrs = self.metadata.attributes - stripped_metadata = utils.copy_and_replace_metadata( - self.metadata, new_dimension_names=None, new_attributes={} + + cf_attrs = { + k: v + for k, v in self.metadata.attributes.items() + if k in utils.CF_ENCODING_ATTRS + } + var_attrs = { + k: v + for k, v in self.metadata.attributes.items() + if k not in utils.CF_ENCODING_ATTRS + } + + split_metadata = utils.copy_and_replace_metadata( + self.metadata, new_dimension_names=None, new_attributes=cf_attrs ) - stripped_marr = ManifestArray( - chunkmanifest=self.manifest, metadata=stripped_metadata + split_marr = ManifestArray( + chunkmanifest=self.manifest, metadata=split_metadata ) return xr.Variable( - data=stripped_marr, + data=split_marr, dims=dims, - attrs=attrs, + attrs=var_attrs, ) diff --git a/virtualizarr/manifests/utils.py b/virtualizarr/manifests/utils.py index 8e3fc874..4d737a2f 100644 --- a/virtualizarr/manifests/utils.py +++ b/virtualizarr/manifests/utils.py @@ -219,6 +219,46 @@ def check_compatible_encodings(encoding1, encoding2): ) +# CF-convention attributes that control how packed chunk values are decoded. +# These live on `ManifestArray.metadata.attributes` rather than on the +# wrapping `xr.Variable.attrs`, so that mismatched values across concat +# inputs can be detected by `check_combinable_zarr_arrays` before the +# silent "first array wins" inheritance corrupts decoded values. +# Kept in sync with `virtualizarr.writers.icechunk.ENCODING_KEYS`. +CF_ENCODING_ATTRS = frozenset( + {"_FillValue", "missing_value", "scale_factor", "add_offset"} +) + + +def check_same_cf_encoding_attrs( + attrs_list: list[dict[str, Any]], +) -> None: + """Check that CF decoding-relevant attributes agree across all arrays.""" + first_attrs, *other_attrs_list = attrs_list + for other_attrs in other_attrs_list: + for key in CF_ENCODING_ATTRS: + in_first = key in first_attrs + in_other = key in other_attrs + if in_first != in_other: + present, missing = ( + ("first", "other") if in_first else ("other", "first") + ) + raise ValueError( + f"Cannot concatenate arrays with inconsistent CF encoding: " + f"attribute {key!r} is present on the {present} array but " + f"missing on the {missing} array. Mixing chunks with and " + f"without {key} would silently corrupt decoded values." + ) + if in_first and first_attrs[key] != other_attrs[key]: + raise ValueError( + f"Cannot concatenate arrays with inconsistent CF encoding " + f"attribute {key!r}: {first_attrs[key]!r} vs " + f"{other_attrs[key]!r}. Applying one value's decoding to " + f"chunks packed with the other would silently corrupt " + f"decoded values." + ) + + def check_same_codecs(codecs: list[Any]) -> None: first_codec, *other_codecs = codecs for codec in other_codecs: @@ -325,6 +365,8 @@ def check_combinable_zarr_arrays( The downside of the ManifestArray approach compared to the VirtualZarrArray concatenation proposal is that the result must also be a single valid zarr array, implying that the inputs must have the same dtype, codec etc. """ + arrays = list(arrays) + check_same_dtypes([arr.dtype for arr in arrays]) # Can't combine different codecs in one manifest @@ -334,6 +376,10 @@ def check_combinable_zarr_arrays( # Would require variable-length chunks ZEP check_same_chunk_shapes([arr.chunks for arr in arrays]) + # Mismatched CF decoding attrs across inputs would silently corrupt values + # once the concatenated array is decoded by xarray. + check_same_cf_encoding_attrs([dict(arr.metadata.attributes) for arr in arrays]) + def check_compatible_arrays( ma: "ManifestArray", existing_array: "Array", append_axis: int diff --git a/virtualizarr/tests/test_parsers/test_dmrpp.py b/virtualizarr/tests/test_parsers/test_dmrpp.py index 19bbda8e..964e82be 100644 --- a/virtualizarr/tests/test_parsers/test_dmrpp.py +++ b/virtualizarr/tests/test_parsers/test_dmrpp.py @@ -597,7 +597,13 @@ def test_dmrpp_empty_scalar_warns_container(fill_value_scalar_no_chunks_nc4_url) with pytest.warns(UserWarning): parsed_vds = parsed_dmrpp.parse_dataset(object_store=store) vds_g1 = parsed_vds.to_virtual_dataset() - assert vds_g1["data"].attrs == {"_FillValue": -999} + # _FillValue is a CF decoding attribute, which lives on the inner + # ManifestArray rather than on the Variable's arbitrary attrs (see + # ManifestArray.to_virtual_variable). + assert vds_g1["data"].attrs == {} + assert vds_g1["data"].variable.data.metadata.attributes == { + "_FillValue": -999 + } def test_dmrpp_phony_dim_naming(): diff --git a/virtualizarr/tests/test_parsers/test_hdf/test_hdf.py b/virtualizarr/tests/test_parsers/test_hdf/test_hdf.py index 00270f7c..5779a606 100644 --- a/virtualizarr/tests/test_parsers/test_hdf/test_hdf.py +++ b/virtualizarr/tests/test_parsers/test_hdf/test_hdf.py @@ -297,7 +297,9 @@ def test_concat_mismatched_add_offset_raises(self, tmp_path, local_registry): xr.concat([vds1, vds2], dim="x") def test_concat_matching_cf_encoding_succeeds(self, tmp_path, local_registry): - # positive control: identical CF encoding must still concatenate cleanly + # positive control: identical CF encoding must still concatenate cleanly, + # and the decoding attrs must survive on the concatenated ManifestArray + # so that the next writer can materialize them in the destination store. p1 = tmp_path / "a.nc" p2 = tmp_path / "b.nc" _write_packed_hdf5(p1, x_start=0, x_len=4, scale_factor=0.1, add_offset=0.0) @@ -314,5 +316,6 @@ def test_concat_matching_cf_encoding_succeeds(self, tmp_path, local_registry): ): combined = xr.concat([vds1, vds2], dim="x") assert combined["foo"].shape == (8, 1) - assert combined["foo"].attrs["scale_factor"] == 0.1 - assert combined["foo"].attrs["add_offset"] == 0.0 + combined_attrs = combined["foo"].variable.data.metadata.attributes + assert combined_attrs["scale_factor"] == 0.1 + assert combined_attrs["add_offset"] == 0.0 diff --git a/virtualizarr/writers/icechunk.py b/virtualizarr/writers/icechunk.py index 0fa19037..25797361 100644 --- a/virtualizarr/writers/icechunk.py +++ b/virtualizarr/writers/icechunk.py @@ -24,6 +24,7 @@ check_same_shapes_except_axes, check_same_shapes_except_on_concat_axis, ) +from virtualizarr.xarray import extract_cf_encoding_attrs if TYPE_CHECKING: from icechunk import ( @@ -530,7 +531,11 @@ def write_virtual_variable_to_icechunk( fill_value=metadata.fill_value, ) - update_attributes(arr, var.attrs, encoding=var.encoding) + update_attributes( + arr, + {**extract_cf_encoding_attrs(var), **var.attrs}, + encoding=var.encoding, + ) write_manifest_to_icechunk( store=store, diff --git a/virtualizarr/writers/kerchunk.py b/virtualizarr/writers/kerchunk.py index 9cd0782c..74177eb9 100644 --- a/virtualizarr/writers/kerchunk.py +++ b/virtualizarr/writers/kerchunk.py @@ -17,6 +17,7 @@ from virtualizarr.manifests.manifest import join from virtualizarr.types.kerchunk import KerchunkArrRefs, KerchunkStoreRefs from virtualizarr.utils import convert_v3_to_v2_metadata +from virtualizarr.xarray import extract_cf_encoding_attrs class NumpyEncoder(json.JSONEncoder): @@ -130,7 +131,7 @@ def variable_to_kerchunk_arr_refs(var: Variable, var_name: str) -> KerchunkArrRe entry["length"], ] array_v2_metadata = convert_v3_to_v2_metadata(marr.metadata) - zattrs = {**var.attrs, **var.encoding} + zattrs = {**extract_cf_encoding_attrs(var), **var.attrs, **var.encoding} else: var = encode_zarr_variable(var) try: diff --git a/virtualizarr/xarray.py b/virtualizarr/xarray.py index fc40b6b5..2ebca7fb 100644 --- a/virtualizarr/xarray.py +++ b/virtualizarr/xarray.py @@ -25,6 +25,7 @@ from virtualizarr.manifests import ManifestArray, ManifestGroup, ManifestStore from virtualizarr.manifests.manifest import validate_and_normalize_path_to_uri +from virtualizarr.manifests.utils import CF_ENCODING_ATTRS from virtualizarr.parallel import get_executor from virtualizarr.parsers.typing import Parser from virtualizarr.utils import compose @@ -37,6 +38,32 @@ ) +def extract_cf_encoding_attrs(var: xr.Variable) -> dict[str, object]: + """ + Return the CF decoding attributes hidden on the inner `ManifestArray` of + a virtual xarray Variable. + + [ManifestArray.to_virtual_variable][virtualizarr.manifests.ManifestArray.to_virtual_variable] + splits attributes by role: the CF decoding attrs in + [CF_ENCODING_ATTRS][virtualizarr.manifests.utils.CF_ENCODING_ATTRS] + stay on the inner `ManifestArray.metadata.attributes` so concat can + detect mismatches, while arbitrary attrs sit on `xr.Variable.attrs`. + Writers materializing the destination store call this to pull the CF + attrs back out so they can write them alongside the variable's other + attrs. + """ + if not isinstance(var.data, ManifestArray): + raise TypeError( + "extract_cf_encoding_attrs requires a variable wrapping a " + f"ManifestArray, got {type(var.data).__name__}" + ) + return { + k: v + for k, v in var.data.metadata.attributes.items() + if k in CF_ENCODING_ATTRS + } + + def open_virtual_datatree( url: str, registry: ObjectStoreRegistry, From d7fb31f5af14d0ea774a51f46725f842bcc7d5a1 Mon Sep 17 00:00:00 2001 From: TomNicholas Date: Tue, 26 May 2026 15:42:41 -0400 Subject: [PATCH 3/5] docs: release note for CF encoding concat fix --- docs/releases.md | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/docs/releases.md b/docs/releases.md index 5476948b..34141a52 100644 --- a/docs/releases.md +++ b/docs/releases.md @@ -9,6 +9,12 @@ - `open_virtual_dataset` and `open_virtual_datatree` now populate `ds.encoding["source"]` with the normalized source URI, mirroring [`xarray.open_dataset`][]'s behaviour. Parsers that have already set `encoding["source"]` are left untouched. By [Tom Nicholas](https://github.com/TomNicholas). +### Bug fixes + +- Detect mismatched CF decoding attributes (`scale_factor`, `add_offset`, `_FillValue`, `missing_value`) when concatenating virtual datasets. Previously, `xr.concat` on two virtual datasets whose source files were CF-packed with different scale/offset values silently kept only the first array's attrs, so the surviving values were applied to every chunk after `decode_cf` — corrupting decoded values for everything sourced from non-first files. CF decoding attrs now live on the inner `ManifestArray.metadata.attributes` (where `np.concatenate` dispatch can inspect them) rather than on the wrapping `xr.Variable.attrs`, and `check_combinable_zarr_arrays` raises `ValueError` on any mismatch. Round-trip through the icechunk and kerchunk writers is unchanged. Closes [#1004](https://github.com/zarr-developers/VirtualiZarr/issues/1004). + ([#XXXX](https://github.com/zarr-developers/VirtualiZarr/pull/XXXX)). + By [Tom Nicholas](https://github.com/TomNicholas). + ## v2.6.2 (18th May 2026) Adds an `IcechunkParser` for reading existing icechunk repositories as virtual datasets without copying data, chunk-aligned indexing on `ManifestArray` (so `xarray.Dataset.isel` works end-to-end on virtual datasets), and limited sub-chunk slicing for uncompressed arrays. From 623fbcbcafac0daa2f355502544f6433aebfee77 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Tue, 26 May 2026 19:54:14 +0000 Subject: [PATCH 4/5] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- virtualizarr/manifests/array.py | 4 +--- virtualizarr/tests/test_parsers/test_dmrpp.py | 4 +--- virtualizarr/xarray.py | 4 +--- 3 files changed, 3 insertions(+), 9 deletions(-) diff --git a/virtualizarr/manifests/array.py b/virtualizarr/manifests/array.py index 2272bc14..d49bb128 100644 --- a/virtualizarr/manifests/array.py +++ b/virtualizarr/manifests/array.py @@ -374,9 +374,7 @@ def to_virtual_variable(self) -> xr.Variable: split_metadata = utils.copy_and_replace_metadata( self.metadata, new_dimension_names=None, new_attributes=cf_attrs ) - split_marr = ManifestArray( - chunkmanifest=self.manifest, metadata=split_metadata - ) + split_marr = ManifestArray(chunkmanifest=self.manifest, metadata=split_metadata) return xr.Variable( data=split_marr, diff --git a/virtualizarr/tests/test_parsers/test_dmrpp.py b/virtualizarr/tests/test_parsers/test_dmrpp.py index 964e82be..6c36c8e5 100644 --- a/virtualizarr/tests/test_parsers/test_dmrpp.py +++ b/virtualizarr/tests/test_parsers/test_dmrpp.py @@ -601,9 +601,7 @@ def test_dmrpp_empty_scalar_warns_container(fill_value_scalar_no_chunks_nc4_url) # ManifestArray rather than on the Variable's arbitrary attrs (see # ManifestArray.to_virtual_variable). assert vds_g1["data"].attrs == {} - assert vds_g1["data"].variable.data.metadata.attributes == { - "_FillValue": -999 - } + assert vds_g1["data"].variable.data.metadata.attributes == {"_FillValue": -999} def test_dmrpp_phony_dim_naming(): diff --git a/virtualizarr/xarray.py b/virtualizarr/xarray.py index 2ebca7fb..05976e9b 100644 --- a/virtualizarr/xarray.py +++ b/virtualizarr/xarray.py @@ -58,9 +58,7 @@ def extract_cf_encoding_attrs(var: xr.Variable) -> dict[str, object]: f"ManifestArray, got {type(var.data).__name__}" ) return { - k: v - for k, v in var.data.metadata.attributes.items() - if k in CF_ENCODING_ATTRS + k: v for k, v in var.data.metadata.attributes.items() if k in CF_ENCODING_ATTRS } From 7e812f89a6bf34b1f56b1b7cf1af7e5d7b6f95c3 Mon Sep 17 00:00:00 2001 From: TomNicholas Date: Tue, 26 May 2026 15:56:12 -0400 Subject: [PATCH 5/5] docs: fill in PR # in release note --- docs/releases.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/releases.md b/docs/releases.md index 34141a52..c6b0cabe 100644 --- a/docs/releases.md +++ b/docs/releases.md @@ -12,7 +12,7 @@ ### Bug fixes - Detect mismatched CF decoding attributes (`scale_factor`, `add_offset`, `_FillValue`, `missing_value`) when concatenating virtual datasets. Previously, `xr.concat` on two virtual datasets whose source files were CF-packed with different scale/offset values silently kept only the first array's attrs, so the surviving values were applied to every chunk after `decode_cf` — corrupting decoded values for everything sourced from non-first files. CF decoding attrs now live on the inner `ManifestArray.metadata.attributes` (where `np.concatenate` dispatch can inspect them) rather than on the wrapping `xr.Variable.attrs`, and `check_combinable_zarr_arrays` raises `ValueError` on any mismatch. Round-trip through the icechunk and kerchunk writers is unchanged. Closes [#1004](https://github.com/zarr-developers/VirtualiZarr/issues/1004). - ([#XXXX](https://github.com/zarr-developers/VirtualiZarr/pull/XXXX)). + ([#1005](https://github.com/zarr-developers/VirtualiZarr/pull/1005)). By [Tom Nicholas](https://github.com/TomNicholas). ## v2.6.2 (18th May 2026)