Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 5 additions & 0 deletions docs/releases.md
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,11 @@
- `open_virtual_dataset` and `open_virtual_datatree` now populate `ds.encoding["source"]` with the normalized source URI, mirroring [`xarray.open_dataset`][]'s behaviour. Parsers that have already set `encoding["source"]` are left untouched.
By [Tom Nicholas](https://github.com/TomNicholas).

### Bug fixes

- The HDF parser now expresses CF `scale_factor`/`add_offset` packing as the zarr `scale_offset` + `cast_value` codecs (stripping those attributes) instead of leaving them as attributes. Files packed with different parameters therefore get different codec pipelines, so `check_same_codecs` refuses to concatenate them rather than silently keeping only the first file's parameters and corrupting decoded values for chunks sourced from other files. No concatenation code was changed. The kerchunk (zarr v2) writer reconstructs the equivalent `scale_factor`/`add_offset` attributes on write. Note that codec-based decoding (`packed / scale`) agrees with xarray's `decode_cf` (`packed * scale_factor`) only to within ~1 ULP. Closes [#1004](https://github.com/zarr-developers/VirtualiZarr/issues/1004).
By [Tom Nicholas](https://github.com/TomNicholas).

### Documentation

- Document that virtual concatenation also requires homogeneous CF encoding (`scale_factor`/`add_offset`) across files — xarray's default attribute-merging silently drops mismatched values and produces incorrectly-decoded data on read. Added a new FAQ bullet and a warning admonition under "Combining virtual datasets" in the usage docs. See [#1004](https://github.com/zarr-developers/VirtualiZarr/issues/1004).
Expand Down
3 changes: 3 additions & 0 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -48,6 +48,9 @@ hdf = [
"hdf5plugin",
"imagecodecs",
"imagecodecs-numcodecs==2024.6.1",
# CF scale/offset packing is expressed via the zarr `scale_offset` +
# `cast_value` codecs; the latter needs its Rust backend at parse time.
"cast-value-rs",
]

zarr = ["arro3-core"]
Expand Down
71 changes: 71 additions & 0 deletions virtualizarr/parsers/hdf/filters.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@
from numcodecs.abc import Codec
from numcodecs.fixedscaleoffset import FixedScaleOffset
from xarray.coding.variables import _choose_float_dtype
from zarr.codecs import CastValue, ScaleOffset

from virtualizarr.utils import soft_import

Expand Down Expand Up @@ -68,6 +69,13 @@ class CFCodec(TypedDict):
codec: Codec


class CFCodecs(TypedDict):
target_dtype: np.dtype
codecs: list[dict]
scale: float
offset: float


def _filter_to_codec(
filter_id: str, filter_properties: Union[int, None, Tuple] = None
) -> Codec:
Expand Down Expand Up @@ -170,6 +178,69 @@ def cfcodec_from_dataset(dataset: Dataset) -> Codec | None:
return None


def cf_codecs_from_dataset(dataset: Dataset) -> CFCodecs | None:
"""
Express an h5py dataset's CF scale/offset packing as zarr v3 codecs.

CF-packed data is stored as integers and decoded as
``unpacked = packed * scale_factor + add_offset``. We represent that with
two zarr v3 codecs applied to the (decoded) float array:

- ``scale_offset`` does the arithmetic. Its decode is ``x / scale + offset``,
so ``scale = 1 / scale_factor`` and ``offset = add_offset``.
- ``cast_value`` casts between the decoded float dtype and the stored
integer dtype.

Returning these as codecs (rather than leaving ``scale_factor`` /
``add_offset`` as attributes) means two arrays packed with different
parameters get different codec pipelines, so ``check_same_codecs`` refuses
to concatenate them instead of silently corrupting decoded values
(see https://github.com/zarr-developers/VirtualiZarr/issues/1004).

Parameters
----------
dataset
An h5py dataset.

Returns
-------
CFCodecs or None
``None`` when the dataset has no scale/offset packing.
"""
attributes = {attr: dataset.attrs[attr] for attr in dataset.attrs}
mapping = {}
if "scale_factor" in attributes:
try:
scale_factor = attributes["scale_factor"][0]
except IndexError:
scale_factor = attributes["scale_factor"]
mapping["scale_factor"] = float(1 / scale_factor)
else:
mapping["scale_factor"] = 1
if "add_offset" in attributes:
try:
offset = attributes["add_offset"][0]
except IndexError:
offset = attributes["add_offset"]
mapping["add_offset"] = float(offset)
else:
mapping["add_offset"] = 0
if mapping["scale_factor"] == 1 and mapping["add_offset"] == 0:
return None

target_dtype = np.dtype(_choose_float_dtype(dtype=dataset.dtype, mapping=mapping))
scale_offset = ScaleOffset(
scale=mapping["scale_factor"], offset=mapping["add_offset"]
)
cast_value = CastValue(data_type=dataset.dtype.name)
return CFCodecs(
target_dtype=target_dtype,
codecs=[scale_offset.to_dict(), cast_value.to_dict()],
scale=mapping["scale_factor"],
offset=mapping["add_offset"],
)


def codecs_from_dataset(dataset: Dataset) -> List[Codec]:
"""
Extracts a list of numcodecs from an h5py dataset
Expand Down
33 changes: 19 additions & 14 deletions virtualizarr/parsers/hdf/hdf.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,7 @@
ManifestStore,
)
from virtualizarr.manifests.utils import create_v3_array_metadata
from virtualizarr.parsers.hdf.filters import codecs_from_dataset
from virtualizarr.parsers.hdf.filters import cf_codecs_from_dataset, codecs_from_dataset
from virtualizarr.parsers.typing import ReaderFactory
from virtualizarr.parsers.utils import encode_cf_fill_value
from virtualizarr.types import ChunkKey
Expand Down Expand Up @@ -64,23 +64,28 @@ def _construct_manifest_array(
attrs = _extract_attrs(dataset)
dtype = dataset.dtype

# Temporarily disable use CF->Codecs - TODO re-enable in subsequent PR.
# cfcodec = cfcodec_from_dataset(dataset)
# if cfcodec:
# codecs.insert(0, cfcodec["codec"])
# dtype = cfcodec["target_dtype"]
# attrs.pop("scale_factor", None)
# attrs.pop("add_offset", None)
# else:
# dtype = dataset.dtype
codec_configs = [zarr_codec_config_to_v3(codec.get_config()) for codec in codecs]
fill_value = dataset.fillvalue.item()

# Express CF scale/offset packing as zarr codecs rather than attributes, so
# arrays packed with different parameters get different codec pipelines and
# cannot be silently concatenated. The scale_offset/cast_value codecs run on
# the decoded float array, so they go before the file's byte-level filters.
# See https://github.com/zarr-developers/VirtualiZarr/issues/1004.
cfcodecs = cf_codecs_from_dataset(dataset)
if cfcodecs is not None:
codec_configs = cfcodecs["codecs"] + codec_configs
dtype = cfcodecs["target_dtype"]
# The array is now logically float (decoded), so the fill value — stored
# in the packed integer domain — must be decoded to match, mirroring the
# ScaleOffset codec's decode of `packed / scale + offset`.
fill_value = fill_value / cfcodecs["scale"] + cfcodecs["offset"]
attrs.pop("scale_factor", None)
attrs.pop("add_offset", None)

if "_FillValue" in attrs:
encoded_cf_fill_value = encode_cf_fill_value(attrs["_FillValue"], dtype)
attrs["_FillValue"] = encoded_cf_fill_value

codec_configs = [zarr_codec_config_to_v3(codec.get_config()) for codec in codecs]

fill_value = dataset.fillvalue.item()
dims = tuple(_dataset_dims(dataset, group=group))
metadata = create_v3_array_metadata(
shape=dataset.shape,
Expand Down
110 changes: 110 additions & 0 deletions virtualizarr/tests/test_parsers/test_hdf/test_hdf.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,8 +10,10 @@
from virtualizarr.parsers import HDFParser
from virtualizarr.tests import (
requires_hdf5plugin,
requires_icechunk,
requires_imagecodecs,
)
from virtualizarr.tests.test_integration import roundtrip_as_in_memory_icechunk
from virtualizarr.tests.utils import manifest_store_from_hdf_url


Expand Down Expand Up @@ -228,3 +230,111 @@ def test_netcdf_over_https():
):
np.testing.assert_allclose(ds["z"].min().to_numpy(), -6)
np.testing.assert_allclose(ds["z"].max().to_numpy(), 817)


def _write_packed_hdf5(
path, *, x_start, x_len, scale_factor, add_offset, fill_value=-9999
):
"""Write a small CF-packed int16 HDF5 file with the given scale/offset attrs."""
with h5py.File(path, "w") as f:
data = np.arange(x_start, x_start + x_len, dtype="int16").reshape(x_len, 1)
d = f.create_dataset("foo", data=data, chunks=(x_len, 1))
d.attrs["scale_factor"] = np.float64(scale_factor)
d.attrs["add_offset"] = np.float64(add_offset)
d.attrs["_FillValue"] = np.int16(fill_value)
x = f.create_dataset(
"x", data=np.arange(x_start, x_start + x_len, dtype="int32")
)
x.make_scale("x")
d.dims[0].attach_scale(x)
y = f.create_dataset("y", data=np.arange(1, dtype="int32"))
y.make_scale("y")
d.dims[1].attach_scale(y)


@requires_hdf5plugin
@requires_imagecodecs
class TestConcatMismatchedCFEncoding:
"""
The HDF parser expresses CF scale_factor / add_offset packing as zarr
`scale_offset` + `cast_value` codecs rather than as attributes. Files packed
with different parameters therefore get different codec pipelines, so the
existing `check_same_codecs` refuses to concatenate them instead of silently
keeping only the first file's parameters and corrupting decoded values for
every chunk that did not come from the first file (see
https://github.com/zarr-developers/VirtualiZarr/issues/1004).
"""

def test_concat_mismatched_scale_factor_raises(self, tmp_path, local_registry):
p1 = tmp_path / "a.nc"
p2 = tmp_path / "b.nc"
_write_packed_hdf5(p1, x_start=0, x_len=4, scale_factor=0.1, add_offset=0.0)
_write_packed_hdf5(p2, x_start=4, x_len=4, scale_factor=0.01, add_offset=0.0)

parser = HDFParser()
with (
open_virtual_dataset(
url=f"file://{p1}", parser=parser, registry=local_registry
) as vds1,
open_virtual_dataset(
url=f"file://{p2}", parser=parser, registry=local_registry
) as vds2,
):
# the codec mismatch surfaces as different ScaleOffset codecs:
# scale = 1 / scale_factor, so 0.1 -> 10.0 and 0.01 -> 100.0
with pytest.raises(NotImplementedError, match="ScaleOffset") as excinfo:
xr.concat([vds1, vds2], dim="x")
assert "scale=10.0" in str(excinfo.value)
assert "scale=100.0" in str(excinfo.value)

def test_concat_mismatched_add_offset_raises(self, tmp_path, local_registry):
p1 = tmp_path / "a.nc"
p2 = tmp_path / "b.nc"
_write_packed_hdf5(p1, x_start=0, x_len=4, scale_factor=0.1, add_offset=0.0)
_write_packed_hdf5(p2, x_start=4, x_len=4, scale_factor=0.1, add_offset=100.0)

parser = HDFParser()
with (
open_virtual_dataset(
url=f"file://{p1}", parser=parser, registry=local_registry
) as vds1,
open_virtual_dataset(
url=f"file://{p2}", parser=parser, registry=local_registry
) as vds2,
):
# add_offset maps directly onto the ScaleOffset codec's offset
with pytest.raises(NotImplementedError, match="ScaleOffset") as excinfo:
xr.concat([vds1, vds2], dim="x")
assert "offset=100.0" in str(excinfo.value)

@requires_icechunk
def test_concat_matching_cf_encoding_roundtrips(self, tmp_path, local_registry):
# positive control: identical CF encoding must concatenate cleanly, and a
# write -> reopen -> decode round-trip must yield the correct unpacked
# values for chunks sourced from *both* files (packed = arange,
# decoded = packed * 0.1).
p1 = tmp_path / "a.nc"
p2 = tmp_path / "b.nc"
_write_packed_hdf5(p1, x_start=0, x_len=4, scale_factor=0.1, add_offset=0.0)
_write_packed_hdf5(p2, x_start=4, x_len=4, scale_factor=0.1, add_offset=0.0)

parser = HDFParser()
with (
open_virtual_dataset(
url=f"file://{p1}", parser=parser, registry=local_registry
) as vds1,
open_virtual_dataset(
url=f"file://{p2}", parser=parser, registry=local_registry
) as vds2,
):
combined = xr.concat([vds1, vds2], dim="x")
assert combined["foo"].shape == (8, 1)
# scale/offset live on the codec pipeline, not as attributes
assert (
"scale_factor" not in combined["foo"].variable.data.metadata.attributes
)
assert "add_offset" not in combined["foo"].variable.data.metadata.attributes

roundtrip = roundtrip_as_in_memory_icechunk(combined, tmp_path)
expected = (np.arange(0, 8) * 0.1).reshape(8, 1)
np.testing.assert_allclose(roundtrip["foo"].values, expected)
20 changes: 18 additions & 2 deletions virtualizarr/tests/test_xarray.py
Original file line number Diff line number Diff line change
Expand Up @@ -820,7 +820,15 @@ def test_loadable_variables(

for name, var in ds.variables.items():
if name in actual_loadable_variables:
xrt.assert_identical(vds.variables[name], ds.variables[name])
if "scale_factor" in var.encoding or "add_offset" in var.encoding:
# CF scale/offset packing is now decoded by the zarr
# scale_offset codec (packed / scale) rather than by
# xarray's decode_cf (packed * scale_factor). The two
# agree to within ~1 ULP but are not bit-identical, so
# the decoded values are compared with assert_allclose.
xrt.assert_allclose(vds.variables[name], ds.variables[name])
else:
xrt.assert_identical(vds.variables[name], ds.variables[name])

def test_group_kwarg_not_a_group(self, hdf5_groups_file, local_registry):
parser = HDFParser(group="doesnt_exist")
Expand All @@ -846,7 +854,15 @@ def test_group_kwarg(self, hdf5_groups_file, local_registry):
):
for name in full_ds.variables:
if name in vars_to_load:
xrt.assert_identical(vds.variables[name], full_ds.variables[name])
ref = full_ds.variables[name]
if "scale_factor" in ref.encoding or "add_offset" in ref.encoding:
# CF scale/offset packing is decoded by the zarr
# scale_offset codec (packed / scale) rather than by
# xarray's decode_cf (packed * scale_factor); these agree
# to within ~1 ULP but are not bit-identical.
xrt.assert_allclose(vds.variables[name], ref)
else:
xrt.assert_identical(vds.variables[name], ref)

def test_open_dataset_with_empty(self, hdf5_empty, local_registry):
parser = HDFParser()
Expand Down
Loading
Loading