Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 6 additions & 0 deletions docs/releases.md
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,12 @@
- `open_virtual_dataset` and `open_virtual_datatree` now populate `ds.encoding["source"]` with the normalized source URI, mirroring [`xarray.open_dataset`][]'s behaviour. Parsers that have already set `encoding["source"]` are left untouched.
By [Tom Nicholas](https://github.com/TomNicholas).

### Bug fixes

- Detect mismatched CF decoding attributes (`scale_factor`, `add_offset`, `_FillValue`, `missing_value`) when concatenating virtual datasets. Previously, `xr.concat` on two virtual datasets whose source files were CF-packed with different scale/offset values silently kept only the first array's attrs, so the surviving values were applied to every chunk after `decode_cf` — corrupting decoded values for everything sourced from non-first files. CF decoding attrs now live on the inner `ManifestArray.metadata.attributes` (where `np.concatenate` dispatch can inspect them) rather than on the wrapping `xr.Variable.attrs`, and `check_combinable_zarr_arrays` raises `ValueError` on any mismatch. Round-trip through the icechunk and kerchunk writers is unchanged. Closes [#1004](https://github.com/zarr-developers/VirtualiZarr/issues/1004).
([#1005](https://github.com/zarr-developers/VirtualiZarr/pull/1005)).
By [Tom Nicholas](https://github.com/TomNicholas).

## v2.6.2 (18th May 2026)

Adds an `IcechunkParser` for reading existing icechunk repositories as virtual datasets without copying data, chunk-aligned indexing on `ManifestArray` (so `xarray.Dataset.isel` works end-to-end on virtual datasets), and limited sub-chunk slicing for uncompressed arrays.
Expand Down
38 changes: 28 additions & 10 deletions virtualizarr/manifests/array.py
Original file line number Diff line number Diff line change
Expand Up @@ -338,10 +338,19 @@ def to_virtual_variable(self) -> xr.Variable:
Create a "virtual" xarray.Variable containing the contents of one zarr array.

The returned variable will be "virtual", i.e. it will wrap a single ManifestArray object.

Attributes are split by role:

- CF decoding attributes (`scale_factor`, `add_offset`, `_FillValue`,
`missing_value`) stay on the inner ManifestArray. This is the
canonical home: a value mismatch across concat inputs would
silently corrupt decoded chunks, so it must be detectable at the
ManifestArray layer where np.concatenate dispatches.
- All other attributes are treated as arbitrary metadata and move
onto the wrapping `xr.Variable`, matching xarray's data model
for attrs.
"""

# The xarray data model stores dimension names and arbitrary extra metadata outside of the wrapped array class,
# so to avoid that information being duplicated we strip it from the ManifestArray before wrapping it.
if self.metadata.dimension_names is not None:
dims = self.metadata.dimension_names
elif self.ndim == 0:
Expand All @@ -350,16 +359,25 @@ def to_virtual_variable(self) -> xr.Variable:
raise ValueError(
f"Cannot create virtual variable from {self.ndim}-dimensional array without dimension names."
)
attrs = self.metadata.attributes
stripped_metadata = utils.copy_and_replace_metadata(
self.metadata, new_dimension_names=None, new_attributes={}
)
stripped_marr = ManifestArray(
chunkmanifest=self.manifest, metadata=stripped_metadata

cf_attrs = {
k: v
for k, v in self.metadata.attributes.items()
if k in utils.CF_ENCODING_ATTRS
}
var_attrs = {
k: v
for k, v in self.metadata.attributes.items()
if k not in utils.CF_ENCODING_ATTRS
}

split_metadata = utils.copy_and_replace_metadata(
self.metadata, new_dimension_names=None, new_attributes=cf_attrs
)
split_marr = ManifestArray(chunkmanifest=self.manifest, metadata=split_metadata)

return xr.Variable(
data=stripped_marr,
data=split_marr,
dims=dims,
attrs=attrs,
attrs=var_attrs,
)
46 changes: 46 additions & 0 deletions virtualizarr/manifests/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -219,6 +219,46 @@ def check_compatible_encodings(encoding1, encoding2):
)


# CF-convention attributes that control how packed chunk values are decoded.
# These live on `ManifestArray.metadata.attributes` rather than on the
# wrapping `xr.Variable.attrs`, so that mismatched values across concat
# inputs can be detected by `check_combinable_zarr_arrays` before the
# silent "first array wins" inheritance corrupts decoded values.
# Kept in sync with `virtualizarr.writers.icechunk.ENCODING_KEYS`.
CF_ENCODING_ATTRS = frozenset(
{"_FillValue", "missing_value", "scale_factor", "add_offset"}
)


def check_same_cf_encoding_attrs(
attrs_list: list[dict[str, Any]],
) -> None:
"""Check that CF decoding-relevant attributes agree across all arrays."""
first_attrs, *other_attrs_list = attrs_list
for other_attrs in other_attrs_list:
for key in CF_ENCODING_ATTRS:
in_first = key in first_attrs
in_other = key in other_attrs
if in_first != in_other:
present, missing = (
("first", "other") if in_first else ("other", "first")
)
raise ValueError(
f"Cannot concatenate arrays with inconsistent CF encoding: "
f"attribute {key!r} is present on the {present} array but "
f"missing on the {missing} array. Mixing chunks with and "
f"without {key} would silently corrupt decoded values."
)
if in_first and first_attrs[key] != other_attrs[key]:
raise ValueError(
f"Cannot concatenate arrays with inconsistent CF encoding "
f"attribute {key!r}: {first_attrs[key]!r} vs "
f"{other_attrs[key]!r}. Applying one value's decoding to "
f"chunks packed with the other would silently corrupt "
f"decoded values."
)


def check_same_codecs(codecs: list[Any]) -> None:
first_codec, *other_codecs = codecs
for codec in other_codecs:
Expand Down Expand Up @@ -325,6 +365,8 @@ def check_combinable_zarr_arrays(
The downside of the ManifestArray approach compared to the VirtualZarrArray concatenation proposal is that
the result must also be a single valid zarr array, implying that the inputs must have the same dtype, codec etc.
"""
arrays = list(arrays)

check_same_dtypes([arr.dtype for arr in arrays])

# Can't combine different codecs in one manifest
Expand All @@ -334,6 +376,10 @@ def check_combinable_zarr_arrays(
# Would require variable-length chunks ZEP
check_same_chunk_shapes([arr.chunks for arr in arrays])

# Mismatched CF decoding attrs across inputs would silently corrupt values
# once the concatenated array is decoded by xarray.
check_same_cf_encoding_attrs([dict(arr.metadata.attributes) for arr in arrays])


def check_compatible_arrays(
ma: "ManifestArray", existing_array: "Array", append_axis: int
Expand Down
6 changes: 5 additions & 1 deletion virtualizarr/tests/test_parsers/test_dmrpp.py
Original file line number Diff line number Diff line change
Expand Up @@ -597,7 +597,11 @@ def test_dmrpp_empty_scalar_warns_container(fill_value_scalar_no_chunks_nc4_url)
with pytest.warns(UserWarning):
parsed_vds = parsed_dmrpp.parse_dataset(object_store=store)
vds_g1 = parsed_vds.to_virtual_dataset()
assert vds_g1["data"].attrs == {"_FillValue": -999}
# _FillValue is a CF decoding attribute, which lives on the inner
# ManifestArray rather than on the Variable's arbitrary attrs (see
# ManifestArray.to_virtual_variable).
assert vds_g1["data"].attrs == {}
assert vds_g1["data"].variable.data.metadata.attributes == {"_FillValue": -999}


def test_dmrpp_phony_dim_naming():
Expand Down
91 changes: 91 additions & 0 deletions virtualizarr/tests/test_parsers/test_hdf/test_hdf.py
Original file line number Diff line number Diff line change
Expand Up @@ -228,3 +228,94 @@ def test_netcdf_over_https():
):
np.testing.assert_allclose(ds["z"].min().to_numpy(), -6)
np.testing.assert_allclose(ds["z"].max().to_numpy(), 817)


def _write_packed_hdf5(
path, *, x_start, x_len, scale_factor, add_offset, fill_value=-9999
):
"""Write a small CF-packed int16 HDF5 file with the given scale/offset attrs."""
with h5py.File(path, "w") as f:
data = np.arange(x_start, x_start + x_len, dtype="int16").reshape(x_len, 1)
d = f.create_dataset("foo", data=data, chunks=(x_len, 1))
d.attrs["scale_factor"] = np.float64(scale_factor)
d.attrs["add_offset"] = np.float64(add_offset)
d.attrs["_FillValue"] = np.int16(fill_value)
x = f.create_dataset(
"x", data=np.arange(x_start, x_start + x_len, dtype="int32")
)
x.make_scale("x")
d.dims[0].attach_scale(x)
y = f.create_dataset("y", data=np.arange(1, dtype="int32"))
y.make_scale("y")
d.dims[1].attach_scale(y)


@requires_hdf5plugin
@requires_imagecodecs
class TestConcatMismatchedCFEncoding:
"""
Concatenating virtual datasets whose source files were CF-packed with
different scale_factor / add_offset / _FillValue must not silently keep
only the first file's attrs — doing so corrupts decoded values for every
chunk that did not come from the first file.
"""

def test_concat_mismatched_scale_factor_raises(self, tmp_path, local_registry):
p1 = tmp_path / "a.nc"
p2 = tmp_path / "b.nc"
_write_packed_hdf5(p1, x_start=0, x_len=4, scale_factor=0.1, add_offset=0.0)
_write_packed_hdf5(p2, x_start=4, x_len=4, scale_factor=0.01, add_offset=0.0)

parser = HDFParser()
with (
open_virtual_dataset(
url=f"file://{p1}", parser=parser, registry=local_registry
) as vds1,
open_virtual_dataset(
url=f"file://{p2}", parser=parser, registry=local_registry
) as vds2,
):
with pytest.raises(ValueError, match="scale_factor"):
xr.concat([vds1, vds2], dim="x")

def test_concat_mismatched_add_offset_raises(self, tmp_path, local_registry):
p1 = tmp_path / "a.nc"
p2 = tmp_path / "b.nc"
_write_packed_hdf5(p1, x_start=0, x_len=4, scale_factor=0.1, add_offset=0.0)
_write_packed_hdf5(p2, x_start=4, x_len=4, scale_factor=0.1, add_offset=100.0)

parser = HDFParser()
with (
open_virtual_dataset(
url=f"file://{p1}", parser=parser, registry=local_registry
) as vds1,
open_virtual_dataset(
url=f"file://{p2}", parser=parser, registry=local_registry
) as vds2,
):
with pytest.raises(ValueError, match="add_offset"):
xr.concat([vds1, vds2], dim="x")

def test_concat_matching_cf_encoding_succeeds(self, tmp_path, local_registry):
# positive control: identical CF encoding must still concatenate cleanly,
# and the decoding attrs must survive on the concatenated ManifestArray
# so that the next writer can materialize them in the destination store.
p1 = tmp_path / "a.nc"
p2 = tmp_path / "b.nc"
_write_packed_hdf5(p1, x_start=0, x_len=4, scale_factor=0.1, add_offset=0.0)
_write_packed_hdf5(p2, x_start=4, x_len=4, scale_factor=0.1, add_offset=0.0)

parser = HDFParser()
with (
open_virtual_dataset(
url=f"file://{p1}", parser=parser, registry=local_registry
) as vds1,
open_virtual_dataset(
url=f"file://{p2}", parser=parser, registry=local_registry
) as vds2,
):
combined = xr.concat([vds1, vds2], dim="x")
assert combined["foo"].shape == (8, 1)
combined_attrs = combined["foo"].variable.data.metadata.attributes
assert combined_attrs["scale_factor"] == 0.1
assert combined_attrs["add_offset"] == 0.0
7 changes: 6 additions & 1 deletion virtualizarr/writers/icechunk.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,7 @@
check_same_shapes_except_axes,
check_same_shapes_except_on_concat_axis,
)
from virtualizarr.xarray import extract_cf_encoding_attrs

if TYPE_CHECKING:
from icechunk import (
Expand Down Expand Up @@ -530,7 +531,11 @@ def write_virtual_variable_to_icechunk(
fill_value=metadata.fill_value,
)

update_attributes(arr, var.attrs, encoding=var.encoding)
update_attributes(
arr,
{**extract_cf_encoding_attrs(var), **var.attrs},
encoding=var.encoding,
)

write_manifest_to_icechunk(
store=store,
Expand Down
3 changes: 2 additions & 1 deletion virtualizarr/writers/kerchunk.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@
from virtualizarr.manifests.manifest import join
from virtualizarr.types.kerchunk import KerchunkArrRefs, KerchunkStoreRefs
from virtualizarr.utils import convert_v3_to_v2_metadata
from virtualizarr.xarray import extract_cf_encoding_attrs


class NumpyEncoder(json.JSONEncoder):
Expand Down Expand Up @@ -130,7 +131,7 @@ def variable_to_kerchunk_arr_refs(var: Variable, var_name: str) -> KerchunkArrRe
entry["length"],
]
array_v2_metadata = convert_v3_to_v2_metadata(marr.metadata)
zattrs = {**var.attrs, **var.encoding}
zattrs = {**extract_cf_encoding_attrs(var), **var.attrs, **var.encoding}
else:
var = encode_zarr_variable(var)
try:
Expand Down
25 changes: 25 additions & 0 deletions virtualizarr/xarray.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,7 @@

from virtualizarr.manifests import ManifestArray, ManifestGroup, ManifestStore
from virtualizarr.manifests.manifest import validate_and_normalize_path_to_uri
from virtualizarr.manifests.utils import CF_ENCODING_ATTRS
from virtualizarr.parallel import get_executor
from virtualizarr.parsers.typing import Parser
from virtualizarr.utils import compose
Expand All @@ -37,6 +38,30 @@
)


def extract_cf_encoding_attrs(var: xr.Variable) -> dict[str, object]:
"""
Return the CF decoding attributes hidden on the inner `ManifestArray` of
a virtual xarray Variable.

[ManifestArray.to_virtual_variable][virtualizarr.manifests.ManifestArray.to_virtual_variable]
splits attributes by role: the CF decoding attrs in
[CF_ENCODING_ATTRS][virtualizarr.manifests.utils.CF_ENCODING_ATTRS]
stay on the inner `ManifestArray.metadata.attributes` so concat can
detect mismatches, while arbitrary attrs sit on `xr.Variable.attrs`.
Writers materializing the destination store call this to pull the CF
attrs back out so they can write them alongside the variable's other
attrs.
"""
if not isinstance(var.data, ManifestArray):
raise TypeError(
"extract_cf_encoding_attrs requires a variable wrapping a "
f"ManifestArray, got {type(var.data).__name__}"
)
return {
k: v for k, v in var.data.metadata.attributes.items() if k in CF_ENCODING_ATTRS
}


def open_virtual_datatree(
url: str,
registry: ObjectStoreRegistry,
Expand Down
Loading