diff --git a/docs/releases.md b/docs/releases.md index c63829b9..011e3f81 100644 --- a/docs/releases.md +++ b/docs/releases.md @@ -11,6 +11,12 @@ - `open_virtual_dataset` and `open_virtual_datatree` now populate `ds.encoding["source"]` with the normalized source URI, mirroring [`xarray.open_dataset`][]'s behaviour. Parsers that have already set `encoding["source"]` are left untouched. By [Tom Nicholas](https://github.com/TomNicholas). +### Bug fixes + +- `open_virtual_mfdataset(..., combine="nested")` now raises a clear, actionable `ValueError` when a `concat_dim` is a scalar (non-dimension) coordinate that is left virtual (e.g. when each file holds a single timestep so `time` is a scalar). Previously this produced an opaque `TypeError: Could not find a Chunk Manager which recognises type ManifestArray` from xarray, because concatenating along a new dimension requires building an index from the coordinate's values, which a virtual `ManifestArray` cannot provide. The new error tells the user to load the coordinate via `loadable_variables`. Related to [#114](https://github.com/zarr-developers/VirtualiZarr/issues/114). + ([#1015](https://github.com/zarr-developers/VirtualiZarr/pull/1015)). + By [Max Jones](https://github.com/maxrjones). + ### Documentation - Document that virtual concatenation also requires homogeneous CF encoding (`scale_factor`/`add_offset`) across files — xarray's default attribute-merging silently drops mismatched values and produces incorrectly-decoded data on read. Added a new FAQ bullet and a warning admonition under "Combining virtual datasets" in the usage docs. See [#1004](https://github.com/zarr-developers/VirtualiZarr/issues/1004). diff --git a/docs/usage.md b/docs/usage.md index a162fcf5..d182f18e 100644 --- a/docs/usage.md +++ b/docs/usage.md @@ -345,6 +345,28 @@ combined_vds = open_virtual_mfdataset( We passed `combine='nested'` to specify that we want the datasets to be combined in the order they appear, using `xr.combine_nested` under the hood. +!!! warning "Concatenating along a scalar coordinate" + The examples above concatenate along `time`, which is already a *dimension* in each file (so each file holds many timesteps). Concatenating along an existing dimension just stacks the virtual references end-to-end and requires no in-memory data. + + A common alternative layout stores **one timestep per file**, so that `time` is a *scalar* (0-dimensional) coordinate rather than a dimension. To combine these, xarray must promote that scalar to a new dimension and build an index for it, which requires the coordinate's values in memory. Because `time` is left virtual by default (the automatic loading only covers 1D dimension coordinates), the combine will fail unless you load it explicitly: + + ```python + combined_vds = open_virtual_mfdataset( + urls, + registry=registry, + parser=parser, + combine="nested", + concat_dim="time", + loadable_variables=["time"], # load the scalar concat coordinate into memory + coords="minimal", + compat="override", + ) + ``` + + Note that passing `loadable_variables` *replaces* the default set, so the other dimension coordinates (e.g. `lat`/`lon`) are no longer loaded automatically. The `coords="minimal"` and `compat="override"` arguments (see the note above) stop xarray from trying to load those still-virtual coordinates in order to compare them. Alternatively, list every coordinate you want in memory, e.g. `loadable_variables=["time", "lat", "lon"]`. + + More generally, any coordinate you concatenate along that is *not already a dimension* in each source must be listed in `loadable_variables`. + ### Ordering by coordinate values If you're happy to load 1D dimension coordinates into memory, you can use their values to do the ordering for you! diff --git a/virtualizarr/tests/test_xarray.py b/virtualizarr/tests/test_xarray.py index 356feaf7..16adc19a 100644 --- a/virtualizarr/tests/test_xarray.py +++ b/virtualizarr/tests/test_xarray.py @@ -29,6 +29,7 @@ slow_test, ) from virtualizarr.tests.utils import obstore_http, obstore_s3 +from virtualizarr.xarray import _raise_on_virtual_scalar_concat_dim def test_wrapping(array_v3_metadata): @@ -968,6 +969,118 @@ def test_parallel_open( ) xrt.assert_identical(combined_vds, expected_vds) + def test_nested_concat_dim_scalar_virtual_coord_raises( + self, tmp_path, local_registry + ): + # When each file holds a single timestep, `time` is a scalar coordinate that + # is left virtual by default. xarray.concat must build an index for the new + # dimension, which can't be done from a virtual ManifestArray. Raise a clear, + # actionable error pointing at loadable_variables instead of a cryptic + # chunk-manager TypeError. + filepath1 = tmp_path / "scalar_time1.nc" + filepath2 = tmp_path / "scalar_time2.nc" + with xr.tutorial.open_dataset("air_temperature") as ds: + ds.isel(time=0).to_netcdf(filepath1, format="NETCDF4") + ds.isel(time=1).to_netcdf(filepath2, format="NETCDF4") + + parser = HDFParser() + with pytest.raises(ValueError, match="loadable_variables"): + open_virtual_mfdataset( + [str(filepath1), str(filepath2)], + registry=local_registry, + parser=parser, + combine="nested", + concat_dim="time", + ) + + # ...and that loading the scalar concat coordinate makes it work. The other + # coordinates (lat/lon) are left virtual here, so we also pass + # coords="minimal"/compat="override" to avoid xarray loading them for + # comparison (the documented pattern for combining virtual datasets). + combined_vds = open_virtual_mfdataset( + [str(filepath1), str(filepath2)], + registry=local_registry, + parser=parser, + combine="nested", + concat_dim="time", + loadable_variables=["time"], + coords="minimal", + compat="override", + ) + assert combined_vds.sizes["time"] == 2 + + def test_nested_concat_dim_none_merges_without_raising( + self, netcdf4_file, local_registry + ): + # combine="nested" with concat_dim=None is the merge-only path: datasets are + # merged rather than concatenated, so no new dimension/index is built and the + # scalar-concat guard must not fire. Merging a file with itself shares an + # identical grid, so the virtual variables need no reindexing; compat="override" + # avoids comparing (and thus loading) the duplicate virtual data variable. + parser = HDFParser() + + combined_vds = open_virtual_mfdataset( + [netcdf4_file, netcdf4_file], + registry=local_registry, + parser=parser, + combine="nested", + concat_dim=None, + compat="override", + ) + + with open_virtual_dataset( + url=netcdf4_file, registry=local_registry, parser=parser + ) as expected_vds: + xrt.assert_identical(combined_vds, expected_vds) + + +class TestRaiseOnVirtualScalarConcatDim: + """Unit tests for the pre-combine guard against virtual scalar concat dimensions.""" + + def _scalar_virtual_coord_ds(self, array_v3_metadata): + metadata = array_v3_metadata(chunks=(), shape=()) + manifest = ChunkManifest( + entries={"0": {"path": "/foo.nc", "offset": 0, "length": 8}} + ) + marr = ManifestArray(metadata=metadata, chunkmanifest=manifest) + return xr.Dataset(coords=xr.Coordinates({"time": ((), marr)}, indexes={})) + + def test_raises_for_scalar_virtual_concat_dim(self, array_v3_metadata): + ds = self._scalar_virtual_coord_ds(array_v3_metadata) + with pytest.raises(ValueError, match="loadable_variables"): + _raise_on_virtual_scalar_concat_dim([ds], ["time"]) + + def test_no_raise_for_loaded_scalar_concat_dim(self): + ds = xr.Dataset(coords={"time": ((), np.array(5.0))}) + _raise_on_virtual_scalar_concat_dim([ds], ["time"]) + + def test_no_raise_for_existing_virtual_dim_coord(self, array_v3_metadata): + # Concatenating along an existing dimension does not build a new index, so a + # virtual dimension coordinate is fine. + metadata = array_v3_metadata(chunks=(2,), shape=(2,)) + manifest = ChunkManifest( + entries={"0": {"path": "/foo.nc", "offset": 0, "length": 16}} + ) + marr = ManifestArray(metadata=metadata, chunkmanifest=manifest) + ds = xr.Dataset(coords=xr.Coordinates({"time": (("time",), marr)}, indexes={})) + _raise_on_virtual_scalar_concat_dim([ds], ["time"]) + + def test_no_raise_for_non_string_or_absent_concat_dim(self): + ds = xr.Dataset(coords={"time": ((), np.array(5.0))}) + _raise_on_virtual_scalar_concat_dim([ds], [None]) + _raise_on_virtual_scalar_concat_dim([ds], ["nonexistent"]) + + def test_raises_when_only_one_dataset_is_offending(self, array_v3_metadata): + # The dimension may be scalar+virtual in only some of the files; a single + # offending source is enough for concat to fail. + good = xr.Dataset(coords={"time": ((), np.array(5.0))}) + bad = self._scalar_virtual_coord_ds(array_v3_metadata) + with pytest.raises(ValueError, match="loadable_variables"): + _raise_on_virtual_scalar_concat_dim([good, bad], ["time"]) + + def test_no_raise_for_empty_datasets(self): + _raise_on_virtual_scalar_concat_dim([], ["time"]) + def test_drop_variables(netcdf4_file, local_registry): parser = HDFParser() diff --git a/virtualizarr/xarray.py b/virtualizarr/xarray.py index fc40b6b5..a44bab92 100644 --- a/virtualizarr/xarray.py +++ b/virtualizarr/xarray.py @@ -383,6 +383,7 @@ def open_virtual_mfdataset( # Combine all datasets, closing them in case of a ValueError try: if combine == "nested": + _raise_on_virtual_scalar_concat_dim(virtual_datasets, concat_dim) # Combined nested list by successive concat and merge operations # along each dimension, using structure given by "ids" combined_vds = _nested_combine( @@ -430,6 +431,59 @@ def open_virtual_mfdataset( return combined_vds +def _raise_on_virtual_scalar_concat_dim( + virtual_datasets: Sequence[Dataset], + concat_dims: ( + str + | DataArray + | Index + | Sequence[str] + | Sequence[DataArray] + | Sequence[Index] + | None + ), +) -> None: + """Raise a clear error when a concatenation dimension is an unloaded scalar coordinate. + + When ``concat_dim`` names a scalar (non-dimension) coordinate, ``xarray.concat`` + promotes it to a new dimension and builds a pandas index from its values. That + requires materializing the coordinate as a numpy array, which is impossible for a + virtual [ManifestArray][virtualizarr.manifests.ManifestArray]; the result is an + opaque ``TypeError`` from xarray's chunk-manager lookup. Such a coordinate must + instead be loaded into memory via ``loadable_variables``. + + Index creation happens in ``expand_dims`` on each dataset individually, so a single + offending source is enough to fail the concat. We therefore check every dataset, not + just the first, since the datasets may be heterogeneous (e.g. the dimension could be + scalar in only one file). + + Concatenating along an *existing* dimension does not create a new index, so a virtual + dimension coordinate is fine in that case and is left untouched here. + """ + if concat_dims is None: + return + dims: Sequence[Any] = ( + [concat_dims] + if isinstance(concat_dims, str | DataArray | Index) + else concat_dims + ) + concat_dim_names = [dim for dim in dims if isinstance(dim, str)] + for ds in virtual_datasets: + for dim in concat_dim_names: + if dim in ds.dims: + continue + var = ds.variables.get(dim) + if var is not None and isinstance(var.data, ManifestArray): + raise ValueError( + f"Cannot concatenate along {dim!r} because it is a scalar " + "coordinate that has not been loaded into memory in at least one " + "source. xarray builds an index for the new dimension when " + "concatenating, which requires the coordinate's values, but it is " + "backed by a virtual ManifestArray. Load it by passing " + f"loadable_variables=[{dim!r}, ...] to open_virtual_mfdataset." + ) + + def construct_fully_virtual_dataset( virtual_vars: Mapping[str, xr.Variable], coord_names: Iterable[str] | None = None,