diff --git a/conftest.py b/conftest.py index 2c9d1c10..1538e60c 100644 --- a/conftest.py +++ b/conftest.py @@ -356,6 +356,15 @@ def hdf5_empty(tmp_path: Path) -> str: return str(filepath) +@pytest.fixture +def hdf5_missing_value(tmp_path: Path) -> str: + """Create an HDF5 file with an unwritten int64 dataset.""" + filepath = tmp_path / "compact_lowlevel.h5" + with h5py.File(filepath, "w") as f: + f.create_dataset("my_dataset", shape=(10,), dtype="int64") + return str(filepath) + + @pytest.fixture def hdf5_scalar(tmp_path: Path) -> str: """Create an HDF5 file with a scalar dataset.""" diff --git a/docs/releases.md b/docs/releases.md index f28cc441..32ac6f22 100644 --- a/docs/releases.md +++ b/docs/releases.md @@ -33,6 +33,9 @@ ### Internal changes +- `DMRPPParser` now delegates DMR++ parsing to [pydap](https://github.com/pydap/pydap) (`pydap>=3.5.10`; see [pydap#417](https://github.com/pydap/pydap/issues/417)). To use `DMRPPParser`, install the `dmrpp` extra: `pip install "virtualizarr[dmrpp]"`. + By [Miguel Jimenez-Urias](https://github.com/Mikejmnez). + - Speed up virtual chunk container validation when writing to Icechunk by passing the supported prefixes as a tuple to `str.startswith`, which runs the loop over prefixes in C rather than in a Python generator. The per-reference check is now ~2.6x faster in the common single-container case, which matters when writing manifests with millions of virtual references. The check is still a per-reference Python loop overall (see [icechunk#1167](https://github.com/earth-mover/icechunk/issues/1167) for pushing it down to Icechunk entirely). By [Tom Nicholas](https://github.com/TomNicholas). @@ -249,6 +252,7 @@ Brings `region`-writing support in `.to_icechunk()`, a `ZarrParser` with orders - Fix `ZarrParser` to use public attribute instead of private one ([#916](https://github.com/zarr-developers/VirtualiZarr/pull/916)). By [Max Jones](https://github.com/maxrjones). 
+ ### Documentation - Added FAQ answer comparing the Kerchunk and Icechunk serialization formats. ([#818](https://github.com/zarr-developers/VirtualiZarr/pull/818)). diff --git a/pyproject.toml b/pyproject.toml index 827e60f4..a7c6c652 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -50,6 +50,11 @@ hdf = [ "imagecodecs-numcodecs==2024.6.1", ] +# dmrpp parser (NOTE(review): installs pydap from a pull-request head ref, while the release notes state `pydap>=3.5.10` - confirm and pin a released version once available) +dmrpp = [ + "pydap @ git+https://github.com/pydap/pydap.git@refs/pull/697/head", +] + zarr = ["arro3-core"] # kerchunk-based parsers @@ -83,7 +88,8 @@ all_parsers = [ "virtualizarr[kerchunk_parquet]", "virtualizarr[tiff]", "virtualizarr[grib]", - "virtualizarr[zarr]" + "virtualizarr[zarr]", + "virtualizarr[dmrpp]", ] # writers @@ -214,8 +220,8 @@ run-flaky-tests = { cmd = "pytest -m flaky --run-flaky --run-network-tests --ver min-deps = ["dev", "test", "hdf", "hdf5-lib"] # VirtualiZarr/conftest.py using h5py, so the minimum set of dependencies for testing still includes hdf libs # Inherit from min-deps to get all the test commands, along with optional dependencies test = ["dev", "test", "remote", "hdf", "netcdf3", "fits", "icechunk", "kerchunk", "kerchunk_parquet", "hdf5-lib", "zarr", "py314"] -test-py312 = ["dev", "test", "remote", "hdf", "netcdf3", "fits", "icechunk", "kerchunk", "kerchunk_parquet", "hdf5-lib", "tiff", "grib", "zarr", "py312"] # test against python 3.13 -test-py313 = ["dev", "test", "remote", "hdf", "netcdf3", "fits", "icechunk", "kerchunk", "kerchunk_parquet", "hdf5-lib", "tiff", "grib", "zarr", "py313"] # test against python 3.13 +test-py312 = ["dev", "test", "remote", "hdf", "netcdf3", "fits", "icechunk", "kerchunk", "kerchunk_parquet", "hdf5-lib", "tiff", "grib", "dmrpp", "zarr", "py312"] # test against python 3.12 +test-py313 = ["dev", "test", "remote", "hdf", "netcdf3", "fits", "icechunk", "kerchunk", "kerchunk_parquet", "hdf5-lib", "tiff", "grib", "dmrpp", "zarr", "py313"] # test against python 3.13 test-py314 = ["dev", "test", "remote", "hdf", "netcdf3", "fits", "icechunk", "kerchunk", 
"kerchunk_parquet", "hdf5-lib", "zarr", "py314"] # test against python 3.14 minio = ["dev", "remote", "hdf", "netcdf3", "fits", "icechunk", "kerchunk", "hdf5-lib", "py314", "zarr", "minio"] # tiff (virtual-tiff) is excluded: virtual-tiff 0.5.0 imports zarr's CodecJSON @@ -267,6 +273,7 @@ module = [ "ujson", "zarr", "requests", + "pydap.*", ] ignore_missing_imports = true diff --git a/virtualizarr/parsers/dmrpp.py b/virtualizarr/parsers/dmrpp.py index c6b7b3aa..ad83a4a4 100644 --- a/virtualizarr/parsers/dmrpp.py +++ b/virtualizarr/parsers/dmrpp.py @@ -1,23 +1,11 @@ -import io -import warnings -from pathlib import Path -from typing import Any, Iterable -from xml.etree import ElementTree as ET +from typing import Iterable -import numpy as np -from obspec_utils.protocols import ReadableStore -from obspec_utils.readers import EagerStoreReader from obspec_utils.registry import ObjectStoreRegistry -from virtualizarr.manifests import ( - ChunkManifest, - ManifestArray, - ManifestGroup, - ManifestStore, -) -from virtualizarr.manifests.utils import create_v3_array_metadata -from virtualizarr.parsers.utils import encode_cf_fill_value -from virtualizarr.types import ChunkKey +from virtualizarr.manifests import ManifestStore +from virtualizarr.utils import soft_import + +pydap = soft_import("pydap", "parsing dmrpp references", strict=False) class DMRPPParser: @@ -60,594 +48,13 @@ def __call__( ManifestStore A ManifestStore that provides a Zarr representation of the data source referenced by the DMR++ file. 
""" - store, path_in_store = registry.resolve(url) - reader = EagerStoreReader(store=store, path=path_in_store) - file_bytes = reader.readall() - stream = io.BytesIO(file_bytes) + + from pydap.virtualizarr.parser import DMRParser parser = DMRParser( - root=ET.parse(stream).getroot(), - data_filepath=url.removesuffix(".dmrpp"), + url=url, + object_store=registry, skip_variables=self.skip_variables, ) - manifest_store = parser.parse_dataset(object_store=store, group=self.group) + manifest_store = parser.parse_dataset(group=self.group) return manifest_store - - -class DMRParser: - """ - Parser for the OPeNDAP DMR++ XML format. - Reads groups, dimensions, coordinates, data variables, encoding, chunk manifests, and attributes. - Highly modular to allow support for older dmrpp schema versions. Includes many utility functions to extract - different information such as finding all variable tags, splitting hdf5 groups, parsing dimensions, and more. - - OPeNDAP DMR++ homepage: https://docs.opendap.org/index.php/DMR%2B%2B - """ - - # DAP and DMRPP XML namespaces - _NS = { - "dap": "http://xml.opendap.org/ns/DAP/4.0#", - "dmrpp": "http://xml.opendap.org/dap/dmrpp/1.0.0#", - } - # DAP data types to numpy data types - _DAP_NP_DTYPE = { - "Byte": "uint8", - "UByte": "uint8", - "Int8": "int8", - "UInt8": "uint8", - "Int16": "int16", - "UInt16": "uint16", - "Int32": "int32", - "UInt32": "uint32", - "Int64": "int64", - "UInt64": "uint64", - "Url": "object", - "Float32": "float32", - "Float64": "float64", - "String": "object", - } - # Default zlib compression value - _DEFAULT_ZLIB_VALUE = 6 - # Encoding keys that should be removed from attributes and placed in xarray encoding dict - # _ENCODING_KEYS = {"_FillValue", "missing_value", "scale_factor", "add_offset"} - root: ET.Element - data_filepath: str - - def __init__( - self, - root: ET.Element, - data_filepath: str | None = None, - skip_variables: Iterable[str] | None = None, - ): - """ - Initialize the DMRParser with the given 
DMR++ file contents and source data file path. - - Parameters - ---------- - root - Root of the xml tree structure of a DMR++ file. - data_filepath - The path to the actual data file that will be set in the chunk manifests. - If None, the data file path is taken from the DMR++ file. - """ - self.root = root - self._validation_issues: list[str] = [] - data_filepath_from_root = self._get_attrib(self.root, "name", required=True) - assert data_filepath_from_root is not None # required=True guarantees non-None - self.data_filepath = ( - data_filepath if data_filepath is not None else data_filepath_from_root - ) - self.skip_variables = skip_variables or () - - def _get_attrib( - self, element: ET.Element, attrib_name: str, required: bool = False - ) -> str | None: - """ - Safely get an attribute from an XML element, logging validation issues. - - Parameters - ---------- - element - The XML element to get the attribute from. - attrib_name - The name of the attribute to get. - required - If True, raises a ValueError when the attribute is missing. If False, - returns None and logs the issue. - - Returns - ------- - str | None - The attribute value if found, None otherwise. - - Raises - ------ - ValueError - If required is True and the attribute is not found. - """ - if attrib_name in element.attrib: - return element.attrib[attrib_name] - - element_info = ( - element.tag - if "name" not in element.attrib - else f"{element.tag}[@name='{element.attrib['name']}']" - ) - issue_msg = ( - f"Missing required attribute '{attrib_name}' in element: {element_info}" - ) - self._validation_issues.append(issue_msg) - - if required: - raise ValueError(issue_msg) - - return None - - def parse_dataset( - self, - object_store: ReadableStore, - group: str | None = None, - ) -> ManifestStore: - """ - Parses the given file and creates a ManifestStore. - - Parameters - ---------- - group - The group to parse. Ignored if no groups are present, and the entire - dataset is parsed. 
If `None` or "/", and groups are present, the first group - is parsed. If not `None` or "/", and no groups are present, a UserWarning - is issued indicating that the group will be ignored. - - Returns - ------- - ManifestStore - - Examples - -------- - Open a sample DMR++ file and parse the dataset - """ - group = group or "/" - ngroups = len(self.root.findall("dap:Group", self._NS)) - - if ngroups == 0 and group != "/": - warnings.warn( - f"No groups in DMR++ file {self.data_filepath!r}; " - f"ignoring group parameter {group!r}" - ) - - group_path = Path("/") if ngroups == 0 else Path("/") / group.removeprefix("/") - dataset_element = self._split_groups(self.root).get(group_path) - - if dataset_element is None: - raise ValueError( - f"Group {group_path} not found in DMR++ file {self.data_filepath!r}" - ) - - manifest_group = self._parse_dataset(dataset_element) - registry: ObjectStoreRegistry = ObjectStoreRegistry() - registry.register(self.data_filepath, object_store) - - return ManifestStore(registry=registry, group=manifest_group) - - def find_node_fqn(self, fqn: str) -> ET.Element: - """ - Find the element in the root element by converting the fully qualified name to an xpath query. - - E.g. fqn = "/a/b" --> root.find("./*[@name='a']/*[@name='b']") - - See more about OPeNDAP fully qualified names (FQN) here: https://docs.opendap.org/index.php/DAP4:_Specification_Volume_1#Fully_Qualified_Names - - Parameters - ---------- - fqn - The fully qualified name of an element. For example, "/a/b". - - Returns - ------- - ET.Element - The matching node found within the root element. - - Raises - ------ - ValueError - If the fully qualified name is not found in the root element. 
- """ - if fqn == "/": - return self.root - - elements = fqn.strip("/").split("/") # /a/b/ --> ['a', 'b'] - xpath_segments = [f"*[@name='{element}']" for element in elements] - xpath_query = "/".join([".", *xpath_segments]) # "./[*[@name='a']/*[@name='b']" - - if (element := self.root.find(xpath_query, self._NS)) is None: - raise ValueError(f"Path {fqn} not found in provided root") - - return element - - def _split_groups(self, root: ET.Element) -> dict[Path, ET.Element]: - """ - Split the input element into several ET.Elements by name. - E.g. {"/": , "left": , "right": } - - Parameters - ---------- - root : ET.Element - The root element of the DMR file. - - Returns - ------- - dict[Path, ET.Element] - """ - all_groups: dict[Path, ET.Element] = {} - dataset_tags = [ - d for d in root if d.tag != "{" + self._NS["dap"] + "}" + "Group" - ] - if len(dataset_tags) > 0: - all_groups[Path("/")] = ET.Element(root.tag, root.attrib) - all_groups[Path("/")].extend(dataset_tags) - all_groups.update(self._split_groups_recursive(root, Path("/"))) - return all_groups - - def _split_groups_recursive( - self, root: ET.Element, current_path=Path("") - ) -> dict[Path, ET.Element]: - group_dict: dict[Path, ET.Element] = {} - for g in root.iterfind("dap:Group", self._NS): - group_name = self._get_attrib(g, "name", required=True) - if group_name is None: - continue - new_path = current_path / Path(group_name) - dataset_tags = [ - d for d in g if d.tag != "{" + self._NS["dap"] + "}" + "Group" - ] - group_dict[new_path] = ET.Element(g.tag, g.attrib) - group_dict[new_path].extend(dataset_tags) - group_dict.update(self._split_groups_recursive(g, new_path)) - return group_dict - - def _parse_dataset( - self, - root: ET.Element, - ) -> ManifestGroup: - """ - Parse the dataset using the root element of the DMR++ file. - - Parameters - ---------- - root : ET.Element - The root element of the DMR++ file. 
- - Returns - ------- - ManifestGroup - """ - - manifest_dict: dict[str, ManifestArray] = {} - for var_tag in self._find_var_tags(root): - var_name = self._get_attrib(var_tag, "name") - if var_name and var_name not in self.skip_variables: - try: - variable = self._parse_variable(var_tag) - manifest_dict[var_name] = variable - except (UnboundLocalError, ValueError): - warnings.warn( - f"This DMRpp contains the variable {var_name} that could not" - " be parsed. Consider adding it to the list of skipped " - "variables, or opening an issue to help resolve this" - ) - - # Attributes - attrs: dict[str, str] = {} - # Look for an attribute tag called "HDF5_GLOBAL" and unpack it - hdf5_global_attrs = root.find("dap:Attribute[@name='HDF5_GLOBAL']", self._NS) - if hdf5_global_attrs is not None: - # Remove the container attribute and add its children to the root dataset - root.remove(hdf5_global_attrs) - root.extend(hdf5_global_attrs) - for attr_tag in root.iterfind("dap:Attribute", self._NS): - attrs.update(self._parse_attribute(attr_tag)) - - return ManifestGroup( - arrays=manifest_dict, - attributes=attrs, - ) - - def _find_var_tags(self, root: ET.Element) -> list[ET.Element]: - """ - Find all variable tags in the DMR++ file. Also known as array tags. - Tags are labeled with the DAP data type. E.g. , , - - Parameters - ---------- - root : ET.Element - The root element of the DMR++ file. - - Returns - ------- - list[ET.Element] - """ - vars_tags: list[ET.Element] = [] - for dap_dtype in self._DAP_NP_DTYPE: - vars_tags += root.findall(f"dap:{dap_dtype}", self._NS) - return vars_tags - - def _parse_dim(self, root: ET.Element, dim_index: int = 0) -> dict[str, int]: - """ - Parse single or tag - - If the tag has no name attribute, it is a phony dimension. E.g. --> {"phony_dim_0": 300} - If the tag has both name and size attributes, it is a regular dimension. E.g. 
--> {"lat": 1447} - - Parameters - ---------- - root : ET.Element - The root element Dim/Dimension tag - dim_index : int - Index of the dimension, used for naming phony dimensions - - Returns - ------- - dict - E.g. {"time": 1, "lat": 1447, "lon": 2895}, {"phony_dim_0": 300}, {"time": None, "lat": None, "lon": None} - """ - size_attr = self._get_attrib(root, "size") - name_attr = self._get_attrib(root, "name") - - if size_attr is not None: - size = int(size_attr) - if name_attr is not None: - return {Path(name_attr).name: size} - else: - return {f"phony_dim_{dim_index}": size} - - raise ValueError("Not enough information to parse Dim/Dimension tag") - - def _find_dimension_tags(self, root: ET.Element) -> list[ET.Element]: - """ - Find the all tags with dimension information. - - First attempts to find Dimension tags, then falls back to Dim tags. - If Dim tags are found, the fully qualified name is used to find the corresponding Dimension tag. - If Dim tags have no name attribute, they are phony dimensions and used directly. - - Parameters - ---------- - root : ET.Element - An ElementTree Element from a DMR++ file. - - Returns - ------- - list[ET.Element] - """ - dimension_tags = root.findall("dap:Dimension", self._NS) - if not dimension_tags: - # Dim tags contain a fully qualified name that references a Dimension tag elsewhere in the DMR++ - # or they are phony dimensions (have size but no name) - dim_tags = root.findall("dap:Dim", self._NS) - for d in dim_tags: - dim_name = self._get_attrib(d, "name") - if dim_name is not None: - dimension_tag = self.find_node_fqn(dim_name) - if dimension_tag is not None: - dimension_tags.append(dimension_tag) - else: - # Phony dimension - use the Dim tag directly - dimension_tags.append(d) - return dimension_tags - - def _parse_variable(self, var_tag: ET.Element) -> ManifestArray: - """ - Parse a variable from a DMR++ tag. 
- - Parameters - ---------- - var_tag : ET.Element - An ElementTree Element representing a variable in the DMR++ file. Will have DAP dtype as tag. E.g. - - Returns - ------- - ManifestArray - """ - - # Dimension info - dims: dict[str, int] = {} - dimension_tags = self._find_dimension_tags(var_tag) - for dim_index, dim in enumerate(dimension_tags): - dims.update(self._parse_dim(dim, dim_index=dim_index)) - # convert DAP dtype to numpy dtype - dtype = np.dtype( - self._DAP_NP_DTYPE[var_tag.tag.removeprefix("{" + self._NS["dap"] + "}")] - ) - # Chunks and Filters - shape: tuple[int, ...] = tuple(dims.values()) - chunks_shape = shape - chunks_tag = var_tag.find("dmrpp:chunks", self._NS) - array_fill_value = np.array(0).astype(dtype)[()] - if chunks_tag is not None: - # Chunks - chunk_dim_text = chunks_tag.findtext( - "dmrpp:chunkDimensionSizes", namespaces=self._NS - ) - if chunk_dim_text is not None: - # 1 1447 2895 -> (1, 1447, 2895) - chunks_shape = tuple(map(int, chunk_dim_text.split())) - else: - chunks_shape = shape - fill_value = self._get_attrib(chunks_tag, "fillValue") - if fill_value is not None: - array_fill_value = np.array(fill_value).astype(dtype)[()] - if chunks_shape: - chunkmanifest = self._parse_chunks(chunks_tag, chunks_shape) - else: - chunkmanifest = ChunkManifest(entries={}, shape=array_fill_value.shape) - # Filters - codecs = self._parse_filters(chunks_tag, dtype) - - # Attributes - attrs: dict[str, Any] = {} - for attr_tag in var_tag.iterfind("dap:Attribute", self._NS): - attrs.update(self._parse_attribute(attr_tag)) - if "_FillValue" in attrs: - encoded_cf_fill_value = encode_cf_fill_value(attrs["_FillValue"], dtype) - attrs["_FillValue"] = encoded_cf_fill_value - - metadata = create_v3_array_metadata( - shape=shape, - data_type=dtype, - chunk_shape=chunks_shape, - codecs=codecs, - dimension_names=dims, - attributes=attrs, - fill_value=array_fill_value, - ) - return ManifestArray(metadata=metadata, chunkmanifest=chunkmanifest) - - def 
_parse_attribute(self, attr_tag: ET.Element) -> dict[str, Any]: - """ - Parse an attribute from a DMR++ attr tag. Converts the attribute value to a native python type. - Raises an exception if nested attributes are passed. Container attributes must be unwrapped in the parent function. - - Parameters - ---------- - attr_tag : ET.Element - An ElementTree Element with an tag. - - Returns - ------- - dict - """ - attr: dict[str, Any] = {} - values = [] - attr_type = self._get_attrib(attr_tag, "type") - attr_name = self._get_attrib(attr_tag, "name", required=True) - - if attr_name is None: - return {} - - if attr_type == "Container": - # DMR++ build information that is not part of the dataset - if attr_name == "build_dmrpp_metadata": - return {} - else: - warnings.warn( - "This DMRpp contains a nested attribute " - f"{attr_name}. Nested attributes cannot " - "be assigned to a variable or dataset and will be dropped" - ) - return {} - - if attr_type is None: - return {} - - dtype = np.dtype(self._DAP_NP_DTYPE[attr_type]) - # if multiple Value tags are present, store as "key": "[v1, v2, ...]" - for value_tag in attr_tag: - # cast attribute to native python type using dmr provided dtype - val = ( - dtype.type(value_tag.text).item() - if dtype != np.object_ - else value_tag.text - ) - # "*" may represent nan values in DMR++ - if val == "*": - val = np.nan - values.append(val) - attr[attr_name] = values[0] if len(values) == 1 else values - return attr - - def _parse_filters( - self, chunks_tag: ET.Element, dtype: np.dtype - ) -> list[dict] | None: - """ - Parse filters from a DMR++ chunks tag. - - Parameters - ---------- - chunks_tag : ET.Element - An ElementTree Element with a tag. - - dtype : np.dtype - The numpy dtype of the variable. - - Returns - ------- - list[dict] | None - E.g. 
[{"id": "shuffle", "elementsize": 4}, {"id": "zlib", "level": 4}] - """ - compression_type = self._get_attrib(chunks_tag, "compressionType") - if compression_type is not None: - filters: list[dict] = [] - # shuffle deflate --> ["shuffle", "deflate"] - compression_types = compression_type.split(" ") - for c in compression_types: - if c == "shuffle": - filters.append( - { - "name": "numcodecs.shuffle", - "configuration": {"elementsize": dtype.itemsize}, - } - ) - elif c == "deflate": - deflate_level = self._get_attrib(chunks_tag, "deflateLevel") - level = ( - int(deflate_level) - if deflate_level is not None - else self._DEFAULT_ZLIB_VALUE - ) - filters.append( - { - "name": "numcodecs.zlib", - "configuration": { - "level": level, - }, - } - ) - return filters - return None - - def _parse_chunks( - self, chunks_tag: ET.Element, chunks_shape: tuple[int, ...] - ) -> ChunkManifest: - """ - Parse the chunk manifest from a DMR++ chunks tag. - - Parameters - ---------- - chunks_tag : ET.Element - An ElementTree Element with a tag. - - chunks_shape : tuple - Chunk sizes for each dimension. E.g. 
(1, 1447, 2895) - - Returns - ------- - ChunkManifest - """ - chunkmanifest: dict[ChunkKey, object] = {} - default_num: list[int] = ( - [0 for i in range(len(chunks_shape))] if chunks_shape else [0] - ) - chunk_key_template = ".".join(["{}" for i in range(len(default_num))]) - for chunk_tag in chunks_tag.iterfind("dmrpp:chunk", self._NS): - chunk_num = default_num - chunk_position = self._get_attrib(chunk_tag, "chunkPositionInArray") - if chunk_position is not None: - # "[0,1023,10235]" -> ["0","1023","10235"] - chunk_pos = chunk_position[1:-1].split(",") - # [0,1023,10235] // [1, 1023, 2047] -> [0,1,5] - chunk_num = [ - int(chunk_pos[i]) // chunks_shape[i] - for i in range(len(chunks_shape)) - ] - # [0,1,5] -> "0.1.5" - chunk_key = ChunkKey(chunk_key_template.format(*chunk_num)) - offset = self._get_attrib(chunk_tag, "offset", required=True) - n_bytes = self._get_attrib(chunk_tag, "nBytes", required=True) - if offset is not None and n_bytes is not None: - chunkmanifest[chunk_key] = { - "path": self.data_filepath, - "offset": int(offset), - "length": int(n_bytes), - } - return ChunkManifest(entries=chunkmanifest) diff --git a/virtualizarr/tests/__init__.py b/virtualizarr/tests/__init__.py index 026b7c66..78ff40a3 100644 --- a/virtualizarr/tests/__init__.py +++ b/virtualizarr/tests/__init__.py @@ -44,3 +44,5 @@ def _importorskip( has_arro3, requires_arro3 = _importorskip("arro3.core") # The GribberishParser is new in gribberish 1.0.0. 
has_grib, requires_grib = _importorskip("gribberish", minversion="1.0.0") +# DMRPPParser relies on pydap's DMR++ parser (`pydap.virtualizarr`, expected from pydap 3.5.10). NOTE(review): no `minversion` is enforced here - confirm. +has_pydap, requires_pydap = _importorskip("pydap") diff --git a/virtualizarr/tests/test_parsers/test_dmrpp.py b/virtualizarr/tests/test_parsers/test_dmrpp.py index e04d6833..fd90b042 100644 --- a/virtualizarr/tests/test_parsers/test_dmrpp.py +++ b/virtualizarr/tests/test_parsers/test_dmrpp.py @@ -1,21 +1,16 @@ -import platform -import textwrap -from contextlib import nullcontext -from pathlib import Path -from xml.etree import ElementTree as ET - import pytest +import requests import xarray as xr -import xarray.testing as xrt from obspec_utils.registry import ObjectStoreRegistry -from packaging import version from virtualizarr.parsers import DMRPPParser, HDFParser -from virtualizarr.parsers.dmrpp import DMRParser -from virtualizarr.tests import requires_network, slow_test +from virtualizarr.tests import requires_network, requires_pydap, slow_test from virtualizarr.tests.utils import obstore_local, obstore_s3 from virtualizarr.xarray import open_virtual_dataset +pytest.importorskip("pydap") +from pydap.virtualizarr.parser import DMRParser # noqa: E402 + urls = [ ( "s3://its-live-data/test-space/cloud-experiments/dmrpp/20240826090000-JPL-L4_GHRSST-SSTfnd-MUR25-GLOB-v02.0-fv04.2.nc", @@ -24,373 +19,10 @@ # TODO: later add MUR, SWOT, TEMPO and others by using kerchunk JSON to read refs (rather than reading the whole netcdf file) ] -# This DMRPP was created following the instructions from https://opendap.github.io/DMRpp-wiki/DMRpp.html#sec-build-them on the files produced by conftest.py with the key matching the fixture name. 
-DMRPP_XML_STRINGS = { - "netcdf4_file": textwrap.dedent( - """\ - - - - - - - - - NaN - - - latitude - - - Latitude - - - degrees_north - - - Y - - - - - - - - - NaN - - - longitude - - - Longitude - - - degrees_east - - - X - - - - - - - - - NaN - - - time - - - Time - - - hours since 1800-01-01 - - - standard - - - - - - - - - - - 4xDaily Air temperature at sigma level 995 - - - degK - - - 2 - - - 11 - - - TMP - - - Air temperature - - - NMC Reanalysis - - - Surface - - - Individual Obs - - - Other - - - 185.1600037 - 322.1000061 - - - 0.01 - - - - - - - - - - COARDS - - - 4x daily NMC reanalysis (1948) - - - Data is from NMC initialized reanalysis - (4x/day). These are the 0.9950 sigma level values. - - - Model - - - http://www.esrl.noaa.gov/psd/data/gridded/data.ncep.reanalysis.html - - - - 2025-07-16T18:48:42Z - - - 3.21.1-451 - - - 3.21.1-451 - - - libdap-3.21.1-178 - - - build_dmrpp -f /usr/share/hyrax/air.nc -r air.nc.dmr -u OPeNDAP_DMRpp_DATA_ACCESS_URL -M - - - - """ - ), - "hdf5_groups_file": textwrap.dedent( - """\ - - - - - 2025-07-16T21:57:57Z - - - 3.21.1-451 - - - 3.21.1-451 - - - libdap-3.21.1-178 - - - build_dmrpp -f /usr/share/hyrax/hdf5_groups_file.nc -r hdf5_groups_file.nc.dmr -u OPeNDAP_DMRpp_DATA_ACCESS_URL -M - - - - - - - - - - - NaN - - - latitude - - - Latitude - - - degrees_north - - - Y - - - - - - - - - NaN - - - longitude - - - Longitude - - - degrees_east - - - X - - - - - - - - - NaN - - - time - - - Time - - - hours since 1800-01-01 - - - standard - - - - - - - - - - - 4xDaily Air temperature at sigma level 995 - - - degK - - - 2 - - - 11 - - - TMP - - - Air temperature - - - NMC Reanalysis - - - Surface - - - Individual Obs - - - Other - - - 185.1600037 - 322.1000061 - - - 0.01 - - - - - - - - - - COARDS - - - 4x daily NMC reanalysis (1948) - - - Data is from NMC initialized reanalysis - (4x/day). These are the 0.9950 sigma level values. 
- - - Model - - - http://www.esrl.noaa.gov/psd/data/gridded/data.ncep.reanalysis.html - - - - - """ - ), - "fill_value_scalar_no_chunks_nc4_url": textwrap.dedent( - """\ - - - - - -999 - - - - - empty scalar data - - - - 2025-08-14T23:32:01Z - - - container attributes are no longer supported - - - - - 2025-08-14T23:32:01Z - - - 3.21.1-477 - - - 3.21.1-477 - - - libdap-3.21.1-222 - - - build_dmrpp -f /usr/share/hyrax/fill_value_scalar_no_chunks.nc4 -r fill_value_scalar_no_chunks.nc4.dmr -u OPeNDAP_DMRpp_DATA_ACCESS_URL -M - - - - """ - ), -} - - -def dmrparser(dmrpp_xml_str: str, filepath: str) -> DMRParser: - # TODO we should actually create a dmrpp file in a temporary directory - # this would avoid the need to pass tmp_path separately - - return DMRParser(root=ET.fromstring(dmrpp_xml_str), data_filepath=filepath) - @slow_test @requires_network +@requires_pydap @pytest.mark.parametrize("data_url, dmrpp_url", urls) def test_NASA_dmrpp(data_url, dmrpp_url): store = obstore_s3( @@ -417,6 +49,7 @@ def test_NASA_dmrpp(data_url, dmrpp_url): @requires_network +@requires_pydap @slow_test @pytest.mark.parametrize("data_url, dmrpp_url", urls) def test_NASA_dmrpp_load(data_url, dmrpp_url): @@ -435,288 +68,15 @@ def test_NASA_dmrpp_load(data_url, dmrpp_url): assert ds.load() -@pytest.mark.parametrize( - "fqn_path, expected_xpath", - [ - ("/", "."), - ("/air", "./*[@name='air']"), - ], -) -def test_find_node_fqn_simple(netcdf4_file, fqn_path, expected_xpath): - parser_instance = dmrparser( - DMRPP_XML_STRINGS["netcdf4_file"], filepath=netcdf4_file - ) - result = parser_instance.find_node_fqn(fqn_path) - expected = parser_instance.root.find(expected_xpath, parser_instance._NS) - assert result == expected - - -def test_find_node_fqn_grouped(hdf5_groups_file): - parser_instance = dmrparser( - DMRPP_XML_STRINGS["hdf5_groups_file"], filepath=hdf5_groups_file - ) - result = parser_instance.find_node_fqn("/test/group/air") - expected = parser_instance.root.find( - 
"./*[@name='test']/*[@name='group']/*[@name='air']", parser_instance._NS - ) - assert result == expected - - -@pytest.mark.parametrize( - "group_path", - [ - ("/"), - ("/test"), - ("/test/group"), - ], -) -def test_split_groups(hdf5_groups_file, group_path): - dmrpp_instance = dmrparser( - DMRPP_XML_STRINGS["hdf5_groups_file"], filepath=hdf5_groups_file - ) - - # get all tags in a dataset (so all tags excluding nested groups) - dataset_tags = lambda x: [ - d for d in x if d.tag != "{" + dmrpp_instance._NS["dap"] + "}" + "Group" - ] - # check that contents of the split groups dataset match contents of the original dataset - result_tags = dataset_tags( - dmrpp_instance._split_groups(dmrpp_instance.root)[Path(group_path)] - ) - expected_tags = dataset_tags(dmrpp_instance.find_node_fqn(group_path)) - assert result_tags == expected_tags - - -@pytest.mark.xfail( - platform.system() == "Linux", - reason="See https://github.com/zarr-developers/VirtualiZarr/issues/904.", -) -@pytest.mark.parametrize( - "group,warns", - [ - pytest.param(None, False, id="None"), - pytest.param("/", False, id="/"), - pytest.param("/no-such-group", True, id="/no-such-group"), - ], -) -def test_parse_dataset(group: str | None, warns: bool, netcdf4_file): - drmpp = dmrparser( - DMRPP_XML_STRINGS["netcdf4_file"], filepath=f"file://{netcdf4_file}" - ) - store = obstore_local(url=drmpp.data_filepath) - - with nullcontext() if warns else pytest.raises(BaseException, match="DID NOT WARN"): - with pytest.warns(UserWarning, match=f"ignoring group parameter {group!r}"): - ms = drmpp.parse_dataset(object_store=store, group=group) - - vds = ms.to_virtual_dataset() - - assert vds.sizes == {"lat": 25, "lon": 53, "time": 2920} - assert vds.data_vars.keys() == {"air"} - assert vds.data_vars["air"].dims == ("time", "lat", "lon") - assert vds.attrs == { - "Conventions": "COARDS", - "title": "4x daily NMC reanalysis (1948)", - "description": "Data is from NMC initialized reanalysis\n(4x/day). 
These are the 0.9950 sigma level values.", - "platform": "Model", - "references": "http://www.esrl.noaa.gov/psd/data/gridded/data.ncep.reanalysis.html", - } - assert vds.coords.keys() == {"lat", "lon", "time"} - - -@pytest.mark.xfail( - version.parse(xr.__version__) < version.parse("2025.7.1"), - reason="Offsets in file changed", -) -def test_parse_dataset_nested(hdf5_groups_file): - nested_groups_dmrpp = dmrparser( - DMRPP_XML_STRINGS["hdf5_groups_file"], filepath=f"file://{hdf5_groups_file}" - ) - store = obstore_local(url=f"file://{nested_groups_dmrpp.data_filepath}") - - vds_root_implicit = nested_groups_dmrpp.parse_dataset( - object_store=store - ).to_virtual_dataset(loadable_variables=[]) - vds_root = nested_groups_dmrpp.parse_dataset( - group="/", object_store=store - ).to_virtual_dataset(loadable_variables=[]) - - xrt.assert_identical(vds_root_implicit, vds_root) - assert vds_root.sizes == {} - - vds_g1 = nested_groups_dmrpp.parse_dataset( - group="/test", object_store=store - ).to_virtual_dataset(loadable_variables=[]) - assert vds_g1.sizes == {} - - vds_g2 = nested_groups_dmrpp.parse_dataset( - group="/test/group", object_store=store - ).to_virtual_dataset(loadable_variables=[]) - - assert vds_g2.sizes == {"time": 2920, "lat": 25, "lon": 53} - assert vds_g2.data_vars.keys() == {"air"} - assert vds_g2.data_vars["air"].dims == ("time", "lat", "lon") - - -def test_parse_variable(netcdf4_file): - parser = dmrparser(DMRPP_XML_STRINGS["netcdf4_file"], filepath=netcdf4_file) - - var = parser._parse_variable(parser.find_node_fqn("/air")) - assert var.metadata.dtype.to_native_dtype() == "int16" - assert var.metadata.dimension_names == ("time", "lat", "lon") - assert var.shape == (2920, 25, 53) - assert var.metadata.chunks == (2920, 25, 53) - # _FillValue is encoded for array dtype - assert var.metadata.attributes["scale_factor"] == 0.01 - assert ( - var.metadata.attributes["long_name"] - == "4xDaily Air temperature at sigma level 995" - ) - - 
-@pytest.mark.parametrize( - "attr_path, expected", - [ - ("air/long_name", {"long_name": "4xDaily Air temperature at sigma level 995"}), - ("air/scale_factor", {"scale_factor": 0.01}), - ], -) -def test_parse_attribute(netcdf4_file, attr_path, expected): - parser = dmrparser(DMRPP_XML_STRINGS["netcdf4_file"], filepath=netcdf4_file) - - result = parser._parse_attribute(parser.find_node_fqn(attr_path)) - assert result == expected - - -def test_dmrpp_empty_scalar_warns_container(fill_value_scalar_no_chunks_nc4_url): - parsed_dmrpp = dmrparser( - DMRPP_XML_STRINGS["fill_value_scalar_no_chunks_nc4_url"], - filepath=fill_value_scalar_no_chunks_nc4_url, - ) - store = obstore_local(url=f"file://{parsed_dmrpp.data_filepath}") - with pytest.warns(UserWarning): - parsed_vds = parsed_dmrpp.parse_dataset(object_store=store) - vds_g1 = parsed_vds.to_virtual_dataset() - assert vds_g1["data"].attrs == {"_FillValue": -999} - - -def test_dmrpp_phony_dim_naming(): - dmrpp_xml_str = textwrap.dedent( - """\ - - - - - - - NaN - - - - - - - """ - ) - parser = dmrparser(dmrpp_xml_str, filepath="file:///phony_test.nc") - var = parser._parse_variable(parser.find_node_fqn("/data")) - assert var.metadata.dimension_names == ("phony_dim_0", "phony_dim_1") - assert var.shape == (10, 20) - - -def test_dmrpp_validation_issues_accumulation(): - dmrpp_xml_str = textwrap.dedent( - """\ - - - - - - - 1.0 - - - - - - - """ - ) - parser = dmrparser(dmrpp_xml_str, filepath="file:///validation_test.nc") - parser._parse_dataset(parser.root) - assert len(parser._validation_issues) > 0 - assert any( - "Missing required attribute 'name'" in issue - for issue in parser._validation_issues - ) - - -def test_dmrpp_get_attrib_with_missing_optional(): - dmrpp_xml_str = textwrap.dedent( - """\ - - - - - """ - ) - parser = dmrparser(dmrpp_xml_str, filepath="file:///test.nc") - dimension = parser.root.find("dap:Dimension", parser._NS) - result = parser._get_attrib(dimension, "nonexistent") - assert result is None - 
assert len(parser._validation_issues) == 1 - - -def test_dmrpp_get_attrib_with_required_missing(): - dmrpp_xml_str = textwrap.dedent( - """\ - - - - - """ - ) - parser = dmrparser(dmrpp_xml_str, filepath="file:///test.nc") - dimension = parser.root.find("dap:Dimension", parser._NS) - with pytest.raises(ValueError, match="Missing required attribute 'nonexistent'"): - parser._get_attrib(dimension, "nonexistent", required=True) - - -def test_dmrpp_mixed_named_and_unnamed_dimensions(): - dmrpp_xml_str = textwrap.dedent( - """\ - - - - - - - - - - NaN - - - - - - - """ - ) - parser = dmrparser(dmrpp_xml_str, filepath="file:///mixed_test.nc") - var = parser._parse_variable(parser.find_node_fqn("/data")) - assert var.metadata.dimension_names == ("time", "phony_dim_1", "lat") - assert var.shape == (10, 20, 30) - - +@requires_pydap def test_dmrpp_simple(dmrpp_xml_simple): """Test parsing a simple valid DMR++ XML creates virtual chunk manifests.""" - parser = dmrparser(dmrpp_xml_simple, filepath="file:///simple.nc") + parser = DMRParser( + dmrpp_xml_simple, object_store=obstore_local(url="file:///simple.nc") + ) # Parse dataset - manifest_store = parser.parse_dataset( - object_store=obstore_local(url="file:///"), group="/" - ) + manifest_store = parser.parse_dataset(group="/") # Verify manifest store is created assert manifest_store is not None @@ -744,16 +104,16 @@ def test_dmrpp_simple(dmrpp_xml_simple): assert isinstance(chunk_info["length"], int) +@requires_pydap def test_dmrpp_missing_attrib_validation(dmrpp_xml_with_missing_attrib): """Test that validation issues are accumulated for missing attributes.""" - parser = dmrparser( - dmrpp_xml_with_missing_attrib, filepath="file:///validation_test.nc" + parser = DMRParser( + dmrpp_xml_with_missing_attrib, + object_store=obstore_local(url="file:///validation_test.nc"), ) # Parse dataset - this should accumulate validation issues - manifest_store = parser.parse_dataset( - object_store=obstore_local(url="file:///"), group="/" 
-    )
+    manifest_store = parser.parse_dataset(group="/")
 
     # Verify that validation issues were accumulated
     assert len(parser._validation_issues) > 0
@@ -767,3 +127,33 @@ def test_dmrpp_missing_attrib_validation(dmrpp_xml_with_missing_attrib):
     # Verify manifest store was still created (parser continues despite issues)
     assert manifest_store is not None
     assert manifest_store._group is not None
+
+
+@requires_network
+@requires_pydap
+def test_inlinevalue():
+    """Test that inline values can be parsed into the manifest.
+
+    Downloads a DMR++ document from the OPeNDAP test server, parses it, and
+    checks that ``my_dataset`` is present and that its manifest stores the
+    expected bytes inline under key ``(0, 0)``.
+    """
+    expected_bytes = b"AAAAAAAAAAABAAAAAAAAAAIAAAAAAAAAAwAAAAAAAAAEAAAAAAAAAAUAAAAAAAAABgAAAAAAAAAHAAAAAAAAAAgAAAAAAAAACQAAAAAAAAA="
+
+    dmrpp_file = (
+        "http://test.opendap.org/opendap/data/dmrpp/compact_lowlevel.h5.dmrpp.file"
+    )
+    # Release the session's connections deterministically, bound the wait so a
+    # stalled server cannot hang the test run, and fail on an HTTP error
+    # status instead of handing an error page to the parser.
+    with requests.Session() as session:
+        response = session.get(dmrpp_file, timeout=30)
+        response.raise_for_status()
+        dmrpp = response.content.decode()
+    parser = DMRParser(
+        dmrpp, object_store=obstore_local(url="file:///compact_lowlevel.h5")
+    )
+    ms = parser.parse_dataset()
+    vds = ms.to_virtual_dataset()
+    assert "my_dataset" in vds
+    assert ms._group["my_dataset"]._manifest._inlined == {(0, 0): expected_bytes}