From 3c4076f74c59e50e781c57ddd59be648fa5ee01d Mon Sep 17 00:00:00 2001 From: Miguel Jimenez-Urias Date: Wed, 4 Mar 2026 16:43:58 -0800 Subject: [PATCH 01/22] remove migrated tests --- virtualizarr/tests/test_parsers/test_dmrpp.py | 121 ------------------ 1 file changed, 121 deletions(-) diff --git a/virtualizarr/tests/test_parsers/test_dmrpp.py b/virtualizarr/tests/test_parsers/test_dmrpp.py index e04d6833..01646250 100644 --- a/virtualizarr/tests/test_parsers/test_dmrpp.py +++ b/virtualizarr/tests/test_parsers/test_dmrpp.py @@ -588,127 +588,6 @@ def test_parse_attribute(netcdf4_file, attr_path, expected): assert result == expected -def test_dmrpp_empty_scalar_warns_container(fill_value_scalar_no_chunks_nc4_url): - parsed_dmrpp = dmrparser( - DMRPP_XML_STRINGS["fill_value_scalar_no_chunks_nc4_url"], - filepath=fill_value_scalar_no_chunks_nc4_url, - ) - store = obstore_local(url=f"file://{parsed_dmrpp.data_filepath}") - with pytest.warns(UserWarning): - parsed_vds = parsed_dmrpp.parse_dataset(object_store=store) - vds_g1 = parsed_vds.to_virtual_dataset() - assert vds_g1["data"].attrs == {"_FillValue": -999} - - -def test_dmrpp_phony_dim_naming(): - dmrpp_xml_str = textwrap.dedent( - """\ - - - - - - - NaN - - - - - - - """ - ) - parser = dmrparser(dmrpp_xml_str, filepath="file:///phony_test.nc") - var = parser._parse_variable(parser.find_node_fqn("/data")) - assert var.metadata.dimension_names == ("phony_dim_0", "phony_dim_1") - assert var.shape == (10, 20) - - -def test_dmrpp_validation_issues_accumulation(): - dmrpp_xml_str = textwrap.dedent( - """\ - - - - - - - 1.0 - - - - - - - """ - ) - parser = dmrparser(dmrpp_xml_str, filepath="file:///validation_test.nc") - parser._parse_dataset(parser.root) - assert len(parser._validation_issues) > 0 - assert any( - "Missing required attribute 'name'" in issue - for issue in parser._validation_issues - ) - - -def test_dmrpp_get_attrib_with_missing_optional(): - dmrpp_xml_str = textwrap.dedent( - """\ - - - - - """ - ) - parser = dmrparser(dmrpp_xml_str, filepath="file:///test.nc") - dimension = parser.root.find("dap:Dimension", parser._NS) - result = parser._get_attrib(dimension, "nonexistent") - assert result is None - assert len(parser._validation_issues) == 1 - - -def test_dmrpp_get_attrib_with_required_missing(): - dmrpp_xml_str = textwrap.dedent( - """\ - - - - - """ - ) - parser = dmrparser(dmrpp_xml_str, filepath="file:///test.nc") - dimension = parser.root.find("dap:Dimension", parser._NS) - with pytest.raises(ValueError, match="Missing required attribute 'nonexistent'"): - parser._get_attrib(dimension, "nonexistent", required=True) - - -def test_dmrpp_mixed_named_and_unnamed_dimensions(): - dmrpp_xml_str = textwrap.dedent( - """\ - - - - - - - - - - NaN - - - - - - - """ - ) - parser = dmrparser(dmrpp_xml_str, filepath="file:///mixed_test.nc") - var = parser._parse_variable(parser.find_node_fqn("/data")) - assert var.metadata.dimension_names == ("time", "phony_dim_1", "lat") - assert var.shape == (10, 20, 30) - - def test_dmrpp_simple(dmrpp_xml_simple): """Test parsing a simple valid DMR++ XML creates virtual chunk manifests.""" parser = dmrparser(dmrpp_xml_simple, filepath="file:///simple.nc") From bd8d1e6937cc5fb2d9b6bbf5aeeca7b038f4e35e Mon Sep 17 00:00:00 2001 From: Miguel Jimenez-Urias Date: Wed, 4 Mar 2026 16:55:22 -0800 Subject: [PATCH 02/22] strip DMRParser class --- virtualizarr/parsers/dmrpp.py | 553 +--------------------------------- 1 file changed, 3 insertions(+), 550 deletions(-) diff --git a/virtualizarr/parsers/dmrpp.py b/virtualizarr/parsers/dmrpp.py index c6b7b3aa..2887e6bb 100644 --- a/virtualizarr/parsers/dmrpp.py +++ b/virtualizarr/parsers/dmrpp.py @@ -1,23 +1,13 @@ import io -import warnings -from pathlib import Path -from typing import Any, Iterable +from typing import Iterable from xml.etree import ElementTree as ET -import numpy as np -from obspec_utils.protocols import ReadableStore from obspec_utils.readers import EagerStoreReader from obspec_utils.registry import ObjectStoreRegistry from virtualizarr.manifests import ( - ChunkManifest, - ManifestArray, - ManifestGroup, ManifestStore, ) -from virtualizarr.manifests.utils import create_v3_array_metadata -from virtualizarr.parsers.utils import encode_cf_fill_value -from virtualizarr.types import ChunkKey class DMRPPParser: @@ -89,27 +79,7 @@ class DMRParser: "dap": "http://xml.opendap.org/ns/DAP/4.0#", "dmrpp": "http://xml.opendap.org/dap/dmrpp/1.0.0#", } - # DAP data types to numpy data types - _DAP_NP_DTYPE = { - "Byte": "uint8", - "UByte": "uint8", - "Int8": "int8", - "UInt8": "uint8", - "Int16": "int16", - "UInt16": "uint16", - "Int32": "int32", - "UInt32": "uint32", - "Int64": "int64", - "UInt64": "uint64", - "Url": "object", - "Float32": "float32", - "Float64": "float64", - "String": "object", - } - # Default zlib compression value - _DEFAULT_ZLIB_VALUE = 6 - # Encoding keys that should be removed from attributes and placed in xarray encoding dict - # _ENCODING_KEYS = {"_FillValue", "missing_value", "scale_factor", "add_offset"} + root: ET.Element data_filepath: str @@ -132,522 +102,5 @@ def __init__( """ self.root = root self._validation_issues: list[str] = [] - data_filepath_from_root = self._get_attrib(self.root, "name", required=True) - assert data_filepath_from_root is not None # required=True guarantees non-None - self.data_filepath = ( - data_filepath if data_filepath is not None else data_filepath_from_root - ) + self.data_filepath = data_filepath self.skip_variables = skip_variables or () - - def _get_attrib( - self, element: ET.Element, attrib_name: str, required: bool = False - ) -> str | None: - """ - Safely get an attribute from an XML element, logging validation issues. - - Parameters - ---------- - element - The XML element to get the attribute from. - attrib_name - The name of the attribute to get. - required - If True, raises a ValueError when the attribute is missing. If False, - returns None and logs the issue. - - Returns - ------- - str | None - The attribute value if found, None otherwise. - - Raises - ------ - ValueError - If required is True and the attribute is not found. - """ - if attrib_name in element.attrib: - return element.attrib[attrib_name] - - element_info = ( - element.tag - if "name" not in element.attrib - else f"{element.tag}[@name='{element.attrib['name']}']" - ) - issue_msg = ( - f"Missing required attribute '{attrib_name}' in element: {element_info}" - ) - self._validation_issues.append(issue_msg) - - if required: - raise ValueError(issue_msg) - - return None - - def parse_dataset( - self, - object_store: ReadableStore, - group: str | None = None, - ) -> ManifestStore: - """ - Parses the given file and creates a ManifestStore. - - Parameters - ---------- - group - The group to parse. Ignored if no groups are present, and the entire - dataset is parsed. If `None` or "/", and groups are present, the first group - is parsed. If not `None` or "/", and no groups are present, a UserWarning - is issued indicating that the group will be ignored. - - Returns - ------- - ManifestStore - - Examples - -------- - Open a sample DMR++ file and parse the dataset - """ - group = group or "/" - ngroups = len(self.root.findall("dap:Group", self._NS)) - - if ngroups == 0 and group != "/": - warnings.warn( - f"No groups in DMR++ file {self.data_filepath!r}; " - f"ignoring group parameter {group!r}" - ) - - group_path = Path("/") if ngroups == 0 else Path("/") / group.removeprefix("/") - dataset_element = self._split_groups(self.root).get(group_path) - - if dataset_element is None: - raise ValueError( - f"Group {group_path} not found in DMR++ file {self.data_filepath!r}" - ) - - manifest_group = self._parse_dataset(dataset_element) - registry: ObjectStoreRegistry = ObjectStoreRegistry() - registry.register(self.data_filepath, object_store) - - return ManifestStore(registry=registry, group=manifest_group) - - def find_node_fqn(self, fqn: str) -> ET.Element: - """ - Find the element in the root element by converting the fully qualified name to an xpath query. - - E.g. fqn = "/a/b" --> root.find("./*[@name='a']/*[@name='b']") - - See more about OPeNDAP fully qualified names (FQN) here: https://docs.opendap.org/index.php/DAP4:_Specification_Volume_1#Fully_Qualified_Names - - Parameters - ---------- - fqn - The fully qualified name of an element. For example, "/a/b". - - Returns - ------- - ET.Element - The matching node found within the root element. - - Raises - ------ - ValueError - If the fully qualified name is not found in the root element. - """ - if fqn == "/": - return self.root - - elements = fqn.strip("/").split("/") # /a/b/ --> ['a', 'b'] - xpath_segments = [f"*[@name='{element}']" for element in elements] - xpath_query = "/".join([".", *xpath_segments]) # "./[*[@name='a']/*[@name='b']" - - if (element := self.root.find(xpath_query, self._NS)) is None: - raise ValueError(f"Path {fqn} not found in provided root") - - return element - - def _split_groups(self, root: ET.Element) -> dict[Path, ET.Element]: - """ - Split the input element into several ET.Elements by name. - E.g. {"/": , "left": , "right": } - - Parameters - ---------- - root : ET.Element - The root element of the DMR file. - - Returns - ------- - dict[Path, ET.Element] - """ - all_groups: dict[Path, ET.Element] = {} - dataset_tags = [ - d for d in root if d.tag != "{" + self._NS["dap"] + "}" + "Group" - ] - if len(dataset_tags) > 0: - all_groups[Path("/")] = ET.Element(root.tag, root.attrib) - all_groups[Path("/")].extend(dataset_tags) - all_groups.update(self._split_groups_recursive(root, Path("/"))) - return all_groups - - def _split_groups_recursive( - self, root: ET.Element, current_path=Path("") - ) -> dict[Path, ET.Element]: - group_dict: dict[Path, ET.Element] = {} - for g in root.iterfind("dap:Group", self._NS): - group_name = self._get_attrib(g, "name", required=True) - if group_name is None: - continue - new_path = current_path / Path(group_name) - dataset_tags = [ - d for d in g if d.tag != "{" + self._NS["dap"] + "}" + "Group" - ] - group_dict[new_path] = ET.Element(g.tag, g.attrib) - group_dict[new_path].extend(dataset_tags) - group_dict.update(self._split_groups_recursive(g, new_path)) - return group_dict - - def _parse_dataset( - self, - root: ET.Element, - ) -> ManifestGroup: - """ - Parse the dataset using the root element of the DMR++ file. - - Parameters - ---------- - root : ET.Element - The root element of the DMR++ file. - - Returns - ------- - ManifestGroup - """ - - manifest_dict: dict[str, ManifestArray] = {} - for var_tag in self._find_var_tags(root): - var_name = self._get_attrib(var_tag, "name") - if var_name and var_name not in self.skip_variables: - try: - variable = self._parse_variable(var_tag) - manifest_dict[var_name] = variable - except (UnboundLocalError, ValueError): - warnings.warn( - f"This DMRpp contains the variable {var_name} that could not" - " be parsed. Consider adding it to the list of skipped " - "variables, or opening an issue to help resolve this" - ) - - # Attributes - attrs: dict[str, str] = {} - # Look for an attribute tag called "HDF5_GLOBAL" and unpack it - hdf5_global_attrs = root.find("dap:Attribute[@name='HDF5_GLOBAL']", self._NS) - if hdf5_global_attrs is not None: - # Remove the container attribute and add its children to the root dataset - root.remove(hdf5_global_attrs) - root.extend(hdf5_global_attrs) - for attr_tag in root.iterfind("dap:Attribute", self._NS): - attrs.update(self._parse_attribute(attr_tag)) - - return ManifestGroup( - arrays=manifest_dict, - attributes=attrs, - ) - - def _find_var_tags(self, root: ET.Element) -> list[ET.Element]: - """ - Find all variable tags in the DMR++ file. Also known as array tags. - Tags are labeled with the DAP data type. E.g. , , - - Parameters - ---------- - root : ET.Element - The root element of the DMR++ file. - - Returns - ------- - list[ET.Element] - """ - vars_tags: list[ET.Element] = [] - for dap_dtype in self._DAP_NP_DTYPE: - vars_tags += root.findall(f"dap:{dap_dtype}", self._NS) - return vars_tags - - def _parse_dim(self, root: ET.Element, dim_index: int = 0) -> dict[str, int]: - """ - Parse single or tag - - If the tag has no name attribute, it is a phony dimension. E.g. --> {"phony_dim_0": 300} - If the tag has both name and size attributes, it is a regular dimension. E.g. --> {"lat": 1447} - - Parameters - ---------- - root : ET.Element - The root element Dim/Dimension tag - dim_index : int - Index of the dimension, used for naming phony dimensions - - Returns - ------- - dict - E.g. {"time": 1, "lat": 1447, "lon": 2895}, {"phony_dim_0": 300}, {"time": None, "lat": None, "lon": None} - """ - size_attr = self._get_attrib(root, "size") - name_attr = self._get_attrib(root, "name") - - if size_attr is not None: - size = int(size_attr) - if name_attr is not None: - return {Path(name_attr).name: size} - else: - return {f"phony_dim_{dim_index}": size} - - raise ValueError("Not enough information to parse Dim/Dimension tag") - - def _find_dimension_tags(self, root: ET.Element) -> list[ET.Element]: - """ - Find the all tags with dimension information. - - First attempts to find Dimension tags, then falls back to Dim tags. - If Dim tags are found, the fully qualified name is used to find the corresponding Dimension tag. - If Dim tags have no name attribute, they are phony dimensions and used directly. - - Parameters - ---------- - root : ET.Element - An ElementTree Element from a DMR++ file. - - Returns - ------- - list[ET.Element] - """ - dimension_tags = root.findall("dap:Dimension", self._NS) - if not dimension_tags: - # Dim tags contain a fully qualified name that references a Dimension tag elsewhere in the DMR++ - # or they are phony dimensions (have size but no name) - dim_tags = root.findall("dap:Dim", self._NS) - for d in dim_tags: - dim_name = self._get_attrib(d, "name") - if dim_name is not None: - dimension_tag = self.find_node_fqn(dim_name) - if dimension_tag is not None: - dimension_tags.append(dimension_tag) - else: - # Phony dimension - use the Dim tag directly - dimension_tags.append(d) - return dimension_tags - - def _parse_variable(self, var_tag: ET.Element) -> ManifestArray: - """ - Parse a variable from a DMR++ tag. - - Parameters - ---------- - var_tag : ET.Element - An ElementTree Element representing a variable in the DMR++ file. Will have DAP dtype as tag. E.g. - - Returns - ------- - ManifestArray - """ - - # Dimension info - dims: dict[str, int] = {} - dimension_tags = self._find_dimension_tags(var_tag) - for dim_index, dim in enumerate(dimension_tags): - dims.update(self._parse_dim(dim, dim_index=dim_index)) - # convert DAP dtype to numpy dtype - dtype = np.dtype( - self._DAP_NP_DTYPE[var_tag.tag.removeprefix("{" + self._NS["dap"] + "}")] - ) - # Chunks and Filters - shape: tuple[int, ...] = tuple(dims.values()) - chunks_shape = shape - chunks_tag = var_tag.find("dmrpp:chunks", self._NS) - array_fill_value = np.array(0).astype(dtype)[()] - if chunks_tag is not None: - # Chunks - chunk_dim_text = chunks_tag.findtext( - "dmrpp:chunkDimensionSizes", namespaces=self._NS - ) - if chunk_dim_text is not None: - # 1 1447 2895 -> (1, 1447, 2895) - chunks_shape = tuple(map(int, chunk_dim_text.split())) - else: - chunks_shape = shape - fill_value = self._get_attrib(chunks_tag, "fillValue") - if fill_value is not None: - array_fill_value = np.array(fill_value).astype(dtype)[()] - if chunks_shape: - chunkmanifest = self._parse_chunks(chunks_tag, chunks_shape) - else: - chunkmanifest = ChunkManifest(entries={}, shape=array_fill_value.shape) - # Filters - codecs = self._parse_filters(chunks_tag, dtype) - - # Attributes - attrs: dict[str, Any] = {} - for attr_tag in var_tag.iterfind("dap:Attribute", self._NS): - attrs.update(self._parse_attribute(attr_tag)) - if "_FillValue" in attrs: - encoded_cf_fill_value = encode_cf_fill_value(attrs["_FillValue"], dtype) - attrs["_FillValue"] = encoded_cf_fill_value - - metadata = create_v3_array_metadata( - shape=shape, - data_type=dtype, - chunk_shape=chunks_shape, - codecs=codecs, - dimension_names=dims, - attributes=attrs, - fill_value=array_fill_value, - ) - return ManifestArray(metadata=metadata, chunkmanifest=chunkmanifest) - - def _parse_attribute(self, attr_tag: ET.Element) -> dict[str, Any]: - """ - Parse an attribute from a DMR++ attr tag. Converts the attribute value to a native python type. - Raises an exception if nested attributes are passed. Container attributes must be unwrapped in the parent function. - - Parameters - ---------- - attr_tag : ET.Element - An ElementTree Element with an tag. - - Returns - ------- - dict - """ - attr: dict[str, Any] = {} - values = [] - attr_type = self._get_attrib(attr_tag, "type") - attr_name = self._get_attrib(attr_tag, "name", required=True) - - if attr_name is None: - return {} - - if attr_type == "Container": - # DMR++ build information that is not part of the dataset - if attr_name == "build_dmrpp_metadata": - return {} - else: - warnings.warn( - "This DMRpp contains a nested attribute " - f"{attr_name}. Nested attributes cannot " - "be assigned to a variable or dataset and will be dropped" - ) - return {} - - if attr_type is None: - return {} - - dtype = np.dtype(self._DAP_NP_DTYPE[attr_type]) - # if multiple Value tags are present, store as "key": "[v1, v2, ...]" - for value_tag in attr_tag: - # cast attribute to native python type using dmr provided dtype - val = ( - dtype.type(value_tag.text).item() - if dtype != np.object_ - else value_tag.text - ) - # "*" may represent nan values in DMR++ - if val == "*": - val = np.nan - values.append(val) - attr[attr_name] = values[0] if len(values) == 1 else values - return attr - - def _parse_filters( - self, chunks_tag: ET.Element, dtype: np.dtype - ) -> list[dict] | None: - """ - Parse filters from a DMR++ chunks tag. - - Parameters - ---------- - chunks_tag : ET.Element - An ElementTree Element with a tag. - - dtype : np.dtype - The numpy dtype of the variable. - - Returns - ------- - list[dict] | None - E.g. [{"id": "shuffle", "elementsize": 4}, {"id": "zlib", "level": 4}] - """ - compression_type = self._get_attrib(chunks_tag, "compressionType") - if compression_type is not None: - filters: list[dict] = [] - # shuffle deflate --> ["shuffle", "deflate"] - compression_types = compression_type.split(" ") - for c in compression_types: - if c == "shuffle": - filters.append( - { - "name": "numcodecs.shuffle", - "configuration": {"elementsize": dtype.itemsize}, - } - ) - elif c == "deflate": - deflate_level = self._get_attrib(chunks_tag, "deflateLevel") - level = ( - int(deflate_level) - if deflate_level is not None - else self._DEFAULT_ZLIB_VALUE - ) - filters.append( - { - "name": "numcodecs.zlib", - "configuration": { - "level": level, - }, - } - ) - return filters - return None - - def _parse_chunks( - self, chunks_tag: ET.Element, chunks_shape: tuple[int, ...] - ) -> ChunkManifest: - """ - Parse the chunk manifest from a DMR++ chunks tag. - - Parameters - ---------- - chunks_tag : ET.Element - An ElementTree Element with a tag. - - chunks_shape : tuple - Chunk sizes for each dimension. E.g. (1, 1447, 2895) - - Returns - ------- - ChunkManifest - """ - chunkmanifest: dict[ChunkKey, object] = {} - default_num: list[int] = ( - [0 for i in range(len(chunks_shape))] if chunks_shape else [0] - ) - chunk_key_template = ".".join(["{}" for i in range(len(default_num))]) - for chunk_tag in chunks_tag.iterfind("dmrpp:chunk", self._NS): - chunk_num = default_num - chunk_position = self._get_attrib(chunk_tag, "chunkPositionInArray") - if chunk_position is not None: - # "[0,1023,10235]" -> ["0","1023","10235"] - chunk_pos = chunk_position[1:-1].split(",") - # [0,1023,10235] // [1, 1023, 2047] -> [0,1,5] - chunk_num = [ - int(chunk_pos[i]) // chunks_shape[i] - for i in range(len(chunks_shape)) - ] - # [0,1,5] -> "0.1.5" - chunk_key = ChunkKey(chunk_key_template.format(*chunk_num)) - offset = self._get_attrib(chunk_tag, "offset", required=True) - n_bytes = self._get_attrib(chunk_tag, "nBytes", required=True) - if offset is not None and n_bytes is not None: - chunkmanifest[chunk_key] = { - "path": self.data_filepath, - "offset": int(offset), - "length": int(n_bytes), - } - return ChunkManifest(entries=chunkmanifest) From 05e493554e0310b2b1317cf06dd2acd3828f3482 Mon Sep 17 00:00:00 2001 From: Miguel Jimenez-Urias Date: Wed, 4 Mar 2026 17:03:03 -0800 Subject: [PATCH 03/22] update DMRParser class to allow for on-the-fly dmrpps --- virtualizarr/parsers/dmrpp.py | 101 +++++++++++++++++++++++++++++++++- 1 file changed, 98 insertions(+), 3 deletions(-) diff --git a/virtualizarr/parsers/dmrpp.py b/virtualizarr/parsers/dmrpp.py index 2887e6bb..f0a7f53d 100644 --- a/virtualizarr/parsers/dmrpp.py +++ b/virtualizarr/parsers/dmrpp.py @@ -1,13 +1,22 @@ import io +import warnings +from pathlib import Path from typing import Iterable from xml.etree import ElementTree as ET +from obspec_utils.protocols import ReadableStore from obspec_utils.readers import EagerStoreReader from obspec_utils.registry import ObjectStoreRegistry +from pydap.parsers.dmr import DMRPPParser as _DMRPPParser from virtualizarr.manifests import ( + ChunkManifest, + ManifestArray, + ManifestGroup, ManifestStore, ) +from virtualizarr.manifests.utils import create_v3_array_metadata +from virtualizarr.parsers.utils import encode_cf_fill_value class DMRPPParser: @@ -55,9 +64,15 @@ def __call__( file_bytes = reader.readall() stream = io.BytesIO(file_bytes) + url = ( + url.removesuffix(".dap.dmrpp") + if url.endswith(".dap.dmrpp") + else url.removesuffix(".dmrpp") + ) + parser = DMRParser( root=ET.parse(stream).getroot(), - data_filepath=url.removesuffix(".dmrpp"), + data_filepath=url, skip_variables=self.skip_variables, ) manifest_store = parser.parse_dataset(object_store=store, group=self.group) @@ -101,6 +116,86 @@ def __init__( If None, the data file path is taken from the DMR++ file. """ self.root = root - self._validation_issues: list[str] = [] - self.data_filepath = data_filepath + self.data_filepath = ( + data_filepath if data_filepath is not None else self.root.attrib["name"] + ) self.skip_variables = skip_variables or () + + def dmrparser(self) -> _DMRPPParser: + """Exposes the _DMRParser to external use (avoids breaking changes)""" + return _DMRPPParser( + root=self.root, + data_filepath=self.data_filepath, + skip_variables=self.skip_variables, + ) + + def parse_dataset( + self, + object_store: ReadableStore, + group: str | None = None, + ) -> ManifestStore: + """ + Parses the given file and creates a ManifestStore. + + Parameters + ---------- + group + The group to parse. Ignored if no groups are present, and the entire + dataset is parsed. If `None` or "/", and groups are present, the first group + is parsed. If not `None` or "/", and no groups are present, a UserWarning + is issued indicating that the group will be ignored. + + Returns + ------- + ManifestStore + + Examples + -------- + Open a sample DMR++ file and parse the dataset + """ + group = group or "/" + ngroups = len(self.root.findall("dap:Group", self._NS)) + + if ngroups == 0 and group != "/": + warnings.warn( + f"No groups in DMR++ file {self.data_filepath!r}; " + f"ignoring group parameter {group!r}" + ) + + group_path = Path("/") if ngroups == 0 else Path("/") / group.removeprefix("/") + + dataset_element = self.dmrparser()._split_groups(self.root).get(group_path) + + if dataset_element is None: + raise ValueError( + f"Group {group_path} not found in DMR++ file {self.data_filepath!r}" + ) + + # get two dictionary containing relevant metadata + vars_dict, attrs = self.dmrparser()._parse_dataset(dataset_element) + + manifest_dict: dict[str, ManifestArray] = {} + + for var in vars_dict.keys(): + chunkmanifest = ChunkManifest(vars_dict[var].pop("chunkmanifest", None)) + meta = dict( + [ + (k, v) + for k, v in vars_dict[var].items() + if k not in ["Maps", "fqn_dims"] + ] + ) + if "_FillValue" in meta["attributes"]: + encoded_cf_fill_value = encode_cf_fill_value( + meta["attributes"]["_FillValue"], meta["data_type"] + ) + meta["attributes"]["_FillValue"] = encoded_cf_fill_value + metadata = create_v3_array_metadata(**meta) + manifest_dict[var] = ManifestArray( + metadata=metadata, chunkmanifest=chunkmanifest + ) + manifest_group = ManifestGroup(arrays=manifest_dict, attributes=attrs) + registry = ObjectStoreRegistry() + registry.register(self.data_filepath, object_store) + + return ManifestStore(registry=registry, group=manifest_group) From 7ef4b1565bf52f23552c0a41e6c5a194d6bd010f Mon Sep 17 00:00:00 2001 From: Miguel Jimenez-Urias Date: Wed, 4 Mar 2026 17:06:01 -0800 Subject: [PATCH 04/22] add pydap as optional dependency --- pyproject.toml | 1 + 1 file changed, 1 insertion(+) diff --git a/pyproject.toml b/pyproject.toml index 827e60f4..90d0031b 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -40,6 +40,7 @@ remote = [ "requests", "aiohttp", "s3fs", + "pydap @ git+https://github.com/pydap/pydap.git", ] # non-kerchunk-based parsers From 278a3df6f7b37468c5d004f66f897de446facd10 Mon Sep 17 00:00:00 2001 From: Miguel Jimenez-Urias Date: Wed, 4 Mar 2026 17:38:09 -0800 Subject: [PATCH 05/22] comment tests as these are migrated --- virtualizarr/tests/test_parsers/test_dmrpp.py | 161 ++++++++++-------- 1 file changed, 87 insertions(+), 74 deletions(-) diff --git a/virtualizarr/tests/test_parsers/test_dmrpp.py b/virtualizarr/tests/test_parsers/test_dmrpp.py index 01646250..ac4f562b 100644 --- a/virtualizarr/tests/test_parsers/test_dmrpp.py +++ b/virtualizarr/tests/test_parsers/test_dmrpp.py @@ -1,7 +1,6 @@ import platform import textwrap from contextlib import nullcontext -from pathlib import Path from xml.etree import ElementTree as ET import pytest @@ -435,56 +434,56 @@ def test_NASA_dmrpp_load(data_url, dmrpp_url): assert ds.load() -@pytest.mark.parametrize( - "fqn_path, expected_xpath", - [ - ("/", "."), - ("/air", "./*[@name='air']"), - ], -) -def test_find_node_fqn_simple(netcdf4_file, fqn_path, expected_xpath): - parser_instance = dmrparser( - DMRPP_XML_STRINGS["netcdf4_file"], filepath=netcdf4_file - ) - result = parser_instance.find_node_fqn(fqn_path) - expected = parser_instance.root.find(expected_xpath, parser_instance._NS) - assert result == expected - - -def test_find_node_fqn_grouped(hdf5_groups_file): - parser_instance = dmrparser( - DMRPP_XML_STRINGS["hdf5_groups_file"], filepath=hdf5_groups_file - ) - result = parser_instance.find_node_fqn("/test/group/air") - expected = parser_instance.root.find( - "./*[@name='test']/*[@name='group']/*[@name='air']", parser_instance._NS - ) - assert result == expected - - -@pytest.mark.parametrize( - "group_path", - [ - ("/"), - ("/test"), - ("/test/group"), - ], -) -def test_split_groups(hdf5_groups_file, group_path): - dmrpp_instance = dmrparser( - DMRPP_XML_STRINGS["hdf5_groups_file"], filepath=hdf5_groups_file - ) - - # get all tags in a dataset (so all tags excluding nested groups) - dataset_tags = lambda x: [ - d for d in x if d.tag != "{" + dmrpp_instance._NS["dap"] + "}" + "Group" - ] - # check that contents of the split groups dataset match contents of the original dataset - result_tags = dataset_tags( - dmrpp_instance._split_groups(dmrpp_instance.root)[Path(group_path)] - ) - expected_tags = dataset_tags(dmrpp_instance.find_node_fqn(group_path)) - assert result_tags == expected_tags +# @pytest.mark.parametrize( +# "fqn_path, expected_xpath", +# [ +# ("/", "."), +# ("/air", "./*[@name='air']"), +# ], +# ) +# def test_find_node_fqn_simple(netcdf4_file, fqn_path, expected_xpath): +# parser_instance = dmrparser( +# DMRPP_XML_STRINGS["netcdf4_file"], filepath=netcdf4_file +# ) +# result = parser_instance.find_node_fqn(fqn_path) +# expected = parser_instance.root.find(expected_xpath, parser_instance._NS) +# assert result == expected + + +# def test_find_node_fqn_grouped(hdf5_groups_file): +# parser_instance = dmrparser( +# DMRPP_XML_STRINGS["hdf5_groups_file"], filepath=hdf5_groups_file +# ) +# result = parser_instance.find_node_fqn("/test/group/air") +# expected = parser_instance.root.find( +# "./*[@name='test']/*[@name='group']/*[@name='air']", parser_instance._NS +# ) +# assert result == expected + + +# @pytest.mark.parametrize( +# "group_path", +# [ +# ("/"), +# ("/test"), +# ("/test/group"), +# ], +# ) +# def test_split_groups(hdf5_groups_file, group_path): +# dmrpp_instance = dmrparser( +# DMRPP_XML_STRINGS["hdf5_groups_file"], filepath=hdf5_groups_file +# ) + +# # get all tags in a dataset (so all tags excluding nested groups) +# dataset_tags = lambda x: [ +# d for d in x if d.tag != "{" + dmrpp_instance._NS["dap"] + "}" + "Group" +# ] +# # check that contents of the split groups dataset match contents of the original dataset +# result_tags = dataset_tags( +# dmrpp_instance._split_groups(dmrpp_instance.root)[Path(group_path)] +# ) +# expected_tags = dataset_tags(dmrpp_instance.find_node_fqn(group_path)) +# assert result_tags == expected_tags @pytest.mark.xfail( @@ -558,9 +557,10 @@ def test_parse_dataset_nested(hdf5_groups_file): assert vds_g2.data_vars["air"].dims == ("time", "lat", "lon") -def test_parse_variable(netcdf4_file): - parser = dmrparser(DMRPP_XML_STRINGS["netcdf4_file"], filepath=netcdf4_file) +# def test_parse_variable(netcdf4_file): +# parser = dmrparser(DMRPP_XML_STRINGS["netcdf4_file"], filepath=netcdf4_file) +<<<<<<< HEAD var = parser._parse_variable(parser.find_node_fqn("/air")) assert var.metadata.dtype.to_native_dtype() == "int16" assert var.metadata.dimension_names == ("time", "lat", "lon") @@ -572,20 +572,33 @@ def test_parse_variable(netcdf4_file): var.metadata.attributes["long_name"] == "4xDaily Air temperature at sigma level 995" ) - - -@pytest.mark.parametrize( - "attr_path, expected", - [ - ("air/long_name", {"long_name": "4xDaily Air temperature at sigma level 995"}), - ("air/scale_factor", {"scale_factor": 0.01}), - ], -) -def test_parse_attribute(netcdf4_file, attr_path, expected): - parser = dmrparser(DMRPP_XML_STRINGS["netcdf4_file"], filepath=netcdf4_file) - - result = parser._parse_attribute(parser.find_node_fqn(attr_path)) - assert result == expected +======= +# var = parser._parse_variable(parser.find_node_fqn("/air")) +# assert var.metadata.dtype.to_native_dtype() == "int16" +# assert var.metadata.dimension_names == ("time", "lat", "lon") +# assert var.shape == (2920, 25, 53) +# assert var.chunks == (2920, 25, 53) +# # _FillValue is encoded for array dtype +# assert var.metadata.attributes["scale_factor"] == 0.01 +# assert ( +# var.metadata.attributes["long_name"] +# == "4xDaily Air temperature at sigma level 995" +# ) +>>>>>>> 2536cb0 (comment tests as these are migrated) + + +# @pytest.mark.parametrize( +# "attr_path, expected", +# [ +# ("air/long_name", {"long_name": "4xDaily Air temperature at sigma level 995"}), +# ("air/scale_factor", {"scale_factor": 0.01}), +# ], +# ) +# def test_parse_attribute(netcdf4_file, attr_path, expected): +# parser = dmrparser(DMRPP_XML_STRINGS["netcdf4_file"], filepath=netcdf4_file) + +# result = parser._parse_attribute(parser.find_node_fqn(attr_path)) +# assert result == expected def test_dmrpp_simple(dmrpp_xml_simple): @@ -634,14 +647,14 @@ def test_dmrpp_missing_attrib_validation(dmrpp_xml_with_missing_attrib): object_store=obstore_local(url="file:///"), group="/" ) - # Verify that validation issues were accumulated - assert len(parser._validation_issues) > 0 + # # Verify that validation issues were accumulated + # assert len(parser._validation_issues) > 0 - # Check that the issues mention missing attributes - assert any( - "Missing required attribute 'name'" in issue - for issue in parser._validation_issues - ) + # # Check that the issues mention missing attributes + # assert any( + # "Missing required attribute 'name'" in issue + # for issue in parser._validation_issues + # ) # Verify manifest store was still created (parser continues despite issues) assert manifest_store is not None From d01b22b03587b8e2b7b36b400e8f6b3f90a06e03 Mon Sep 17 00:00:00 2001 From: Miguel Jimenez-Urias Date: Wed, 4 Mar 2026 17:40:47 -0800 Subject: [PATCH 06/22] raise NotImplementError if inline values --- virtualizarr/parsers/dmrpp.py | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/virtualizarr/parsers/dmrpp.py b/virtualizarr/parsers/dmrpp.py index f0a7f53d..afa359d5 100644 --- a/virtualizarr/parsers/dmrpp.py +++ b/virtualizarr/parsers/dmrpp.py @@ -190,6 +190,14 @@ def parse_dataset( meta["attributes"]["_FillValue"], meta["data_type"] ) meta["attributes"]["_FillValue"] = encoded_cf_fill_value + + if "inline" in meta: + raise NotImplementedError( + "Reading inlined reference data is currently not supported." + "See https://github.com/zarr-developers/VirtualiZarr/issues/489", + ) + meta.pop("inline", None) + metadata = create_v3_array_metadata(**meta) manifest_dict[var] = ManifestArray( metadata=metadata, chunkmanifest=chunkmanifest From 421097bc7522ac6e9b7abe21b2f942c83fd249d7 Mon Sep 17 00:00:00 2001 From: Miguel Jimenez-Urias Date: Thu, 5 Mar 2026 10:47:30 -0800 Subject: [PATCH 07/22] install pydap from unreleased PR --- pyproject.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pyproject.toml b/pyproject.toml index 90d0031b..25a14890 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -40,7 +40,7 @@ remote = [ "requests", "aiohttp", "s3fs", - "pydap @ git+https://github.com/pydap/pydap.git", + "pydap @ git+https://github.com/pydap/pydap.git@refs/pull/659/merge", ] # non-kerchunk-based parsers From 161d8312d16bd3e7dae7b45b44f47c6c0180d4cf Mon Sep 17 00:00:00 2001 From: Miguel Jimenez-Urias Date: Wed, 11 Mar 2026 10:04:39 -0700 Subject: [PATCH 08/22] rebase --- virtualizarr/tests/test_parsers/test_dmrpp.py | 36 ++++++------------- 1 file changed, 11 insertions(+), 25 deletions(-) diff --git a/virtualizarr/tests/test_parsers/test_dmrpp.py b/virtualizarr/tests/test_parsers/test_dmrpp.py index ac4f562b..8b7fe557 100644 --- a/virtualizarr/tests/test_parsers/test_dmrpp.py +++ b/virtualizarr/tests/test_parsers/test_dmrpp.py @@ -560,31 +560,17 @@ def test_parse_dataset_nested(hdf5_groups_file): # def test_parse_variable(netcdf4_file): # parser = dmrparser(DMRPP_XML_STRINGS["netcdf4_file"], filepath=netcdf4_file) -<<<<<<< HEAD - var = parser._parse_variable(parser.find_node_fqn("/air")) - assert var.metadata.dtype.to_native_dtype() == "int16" - assert var.metadata.dimension_names == ("time", "lat", "lon") - assert var.shape == (2920, 25, 53) - assert var.metadata.chunks == (2920, 25, 53) - # _FillValue is encoded for array dtype - assert var.metadata.attributes["scale_factor"] == 0.01 - assert ( - var.metadata.attributes["long_name"] - == "4xDaily Air temperature at sigma level 995" - ) -======= -# var = parser._parse_variable(parser.find_node_fqn("/air")) -# assert var.metadata.dtype.to_native_dtype() == "int16" -# assert var.metadata.dimension_names == ("time", "lat", "lon") -# assert var.shape == (2920, 25, 53) -# assert var.chunks == (2920, 25, 53) -# # _FillValue is encoded for array dtype -# assert var.metadata.attributes["scale_factor"] == 0.01 -# assert ( -# var.metadata.attributes["long_name"] -# == "4xDaily Air temperature at sigma level 995" -# ) ->>>>>>> 2536cb0 (comment tests as these are migrated) + # var = parser._parse_variable(parser.find_node_fqn("/air")) + # assert var.metadata.dtype.to_native_dtype() == "int16" + # assert var.metadata.dimension_names == ("time", "lat", "lon") + # assert var.shape == (2920, 25, 53) + # assert var.metadata.chunks == (2920, 25, 53) + # # _FillValue is encoded for array dtype + # assert var.metadata.attributes["scale_factor"] == 0.01 + # assert ( + # var.metadata.attributes["long_name"] + # == "4xDaily Air temperature at sigma level 995" + # ) # @pytest.mark.parametrize( From 0f5bf2f0edfeed63a7c809b437824c41e36023c5 Mon Sep 17 00:00:00 2001 From: Miguel Jimenez-Urias Date: Wed, 11 Mar 2026 10:04:39 -0700 Subject: [PATCH 09/22] rebase --- conftest.py | 9 ++ virtualizarr/parsers/dmrpp.py | 10 ++- virtualizarr/tests/test_parsers/test_dmrpp.py | 87 ++++++------------- 3 files changed, 44 insertions(+), 62 deletions(-) diff --git a/conftest.py b/conftest.py index 2c9d1c10..1538e60c 100644 --- a/conftest.py +++ b/conftest.py @@ -356,6 +356,15 @@ def hdf5_empty(tmp_path: Path) -> str: return str(filepath) +@pytest.fixture +def hdf5_missing_value(tmp_path: Path) -> str: + """Create an empty HDF5 file.""" + filepath = tmp_path / "compact_lowlevel.h5" + with h5py.File(filepath, "w") as f: + f.create_dataset("my_dataset", shape=(10,), dtype="int64") + return str(filepath) + + @pytest.fixture def hdf5_scalar(tmp_path: Path) -> str: """Create an HDF5 file with a scalar dataset.""" diff --git a/virtualizarr/parsers/dmrpp.py b/virtualizarr/parsers/dmrpp.py index afa359d5..60ae7c15 100644 --- a/virtualizarr/parsers/dmrpp.py +++ b/virtualizarr/parsers/dmrpp.py @@ -120,14 +120,17 @@ def __init__( data_filepath if data_filepath is not None else self.root.attrib["name"] ) self.skip_variables = skip_variables or () + self._validation_issues: list[str] = [] def dmrparser(self) -> _DMRPPParser: """Exposes the _DMRParser to external use (avoids breaking changes)""" - return _DMRPPParser( + parser = _DMRPPParser( root=self.root, data_filepath=self.data_filepath, skip_variables=self.skip_variables, ) + self._validation_issues = parser._validation_issues + return parser def parse_dataset( self, @@ -177,7 +180,7 @@ def parse_dataset( manifest_dict: dict[str, ManifestArray] = {} for var in vars_dict.keys(): - chunkmanifest = ChunkManifest(vars_dict[var].pop("chunkmanifest", None)) + chunkmanifest = vars_dict[var].pop("chunkmanifest", None) meta = dict( [ (k, v) @@ -193,11 +196,12 @@ def parse_dataset( if "inline" in meta: raise NotImplementedError( - "Reading inlined reference data is currently not supported." + "Reading inlined reference data is currently not supported. " "See https://github.com/zarr-developers/VirtualiZarr/issues/489", ) meta.pop("inline", None) + chunkmanifest = ChunkManifest(chunkmanifest) metadata = create_v3_array_metadata(**meta) manifest_dict[var] = ManifestArray( metadata=metadata, chunkmanifest=chunkmanifest diff --git a/virtualizarr/tests/test_parsers/test_dmrpp.py b/virtualizarr/tests/test_parsers/test_dmrpp.py index 8b7fe557..fcb6ab88 100644 --- a/virtualizarr/tests/test_parsers/test_dmrpp.py +++ b/virtualizarr/tests/test_parsers/test_dmrpp.py @@ -4,6 +4,7 @@ from xml.etree import ElementTree as ET import pytest +import requests import xarray as xr import xarray.testing as xrt from obspec_utils.registry import ObjectStoreRegistry @@ -434,58 +435,6 @@ def test_NASA_dmrpp_load(data_url, dmrpp_url): assert ds.load() -# @pytest.mark.parametrize( -# "fqn_path, expected_xpath", -# [ -# ("/", "."), -# ("/air", "./*[@name='air']"), -# ], -# ) -# def test_find_node_fqn_simple(netcdf4_file, fqn_path, expected_xpath): -# parser_instance = dmrparser( -# DMRPP_XML_STRINGS["netcdf4_file"], filepath=netcdf4_file -# ) -# result = parser_instance.find_node_fqn(fqn_path) -# expected = parser_instance.root.find(expected_xpath, parser_instance._NS) -# assert result == expected - - -# def test_find_node_fqn_grouped(hdf5_groups_file): -# parser_instance = dmrparser( -# DMRPP_XML_STRINGS["hdf5_groups_file"], filepath=hdf5_groups_file -# ) -# result = parser_instance.find_node_fqn("/test/group/air") -# expected = parser_instance.root.find( -# "./*[@name='test']/*[@name='group']/*[@name='air']", parser_instance._NS -# ) -# assert result == expected - - -# @pytest.mark.parametrize( -# "group_path", -# [ -# ("/"), -# ("/test"), -# ("/test/group"), -# ], -# ) -# def test_split_groups(hdf5_groups_file, group_path): -# dmrpp_instance = dmrparser( -# DMRPP_XML_STRINGS["hdf5_groups_file"], filepath=hdf5_groups_file -# ) - -# # get all tags in a dataset (so all tags excluding nested groups) -# dataset_tags = lambda x: [ -# d for d in x if d.tag != "{" + dmrpp_instance._NS["dap"] + "}" + "Group" -# ] -# # check that contents of the split groups dataset match contents of the original dataset -# result_tags = dataset_tags( -# dmrpp_instance._split_groups(dmrpp_instance.root)[Path(group_path)] -# ) -# expected_tags = dataset_tags(dmrpp_instance.find_node_fqn(group_path)) -# assert result_tags == expected_tags - - @pytest.mark.xfail( platform.system() == "Linux", reason="See https://github.com/zarr-developers/VirtualiZarr/issues/904.", @@ -557,6 +506,7 @@ def test_parse_dataset_nested(hdf5_groups_file): assert vds_g2.data_vars["air"].dims == ("time", "lat", "lon") +<<<<<<< HEAD # def test_parse_variable(netcdf4_file): # parser = dmrparser(DMRPP_XML_STRINGS["netcdf4_file"], filepath=netcdf4_file) @@ -587,6 +537,8 @@ def test_parse_dataset_nested(hdf5_groups_file): # assert result == expected +======= +>>>>>>> 79b9a79 (rebase) def test_dmrpp_simple(dmrpp_xml_simple): """Test parsing a simple valid DMR++ XML creates virtual chunk manifests.""" parser = dmrparser(dmrpp_xml_simple, filepath="file:///simple.nc") @@ -633,15 +585,32 @@ def test_dmrpp_missing_attrib_validation(dmrpp_xml_with_missing_attrib): object_store=obstore_local(url="file:///"), group="/" ) - # # Verify that validation issues were accumulated - # assert len(parser._validation_issues) > 0 + # Verify that validation issues were accumulated + assert len(parser._validation_issues) > 0 - # # Check that the issues mention missing attributes - # assert any( - # "Missing required attribute 'name'" in issue - # for issue in parser._validation_issues - # ) + # Check that the issues mention missing attributes + assert any( + "Missing required attribute 'name'" in issue + for issue in parser._validation_issues + ) # Verify manifest store was still created (parser continues despite issues) assert manifest_store is not None assert manifest_store._group is not None + + +@requires_network +def test_nonimplement_inlinevalue(hdf5_missing_value): + dmrpp_file = ( + "http://test.opendap.org/opendap/data/dmrpp/compact_lowlevel.h5.dmrpp.file" + ) + session = requests.Session() + dmrpp = session.get(dmrpp_file).content.decode() + parser = dmrparser(dmrpp, filepath=f"file:///{hdf5_missing_value}") + store = obstore_local(url=parser.data_filepath) + + with pytest.raises( + NotImplementedError, + match="Reading inlined reference data is currently not supported", + ): + parser.parse_dataset(object_store=store) From c04f1278fc5b94f6cd435777e4479ef26cc1c481 Mon Sep 17 00:00:00 2001 From: Miguel Jimenez-Urias Date: Wed, 11 Mar 2026 10:07:03 -0700 Subject: [PATCH 10/22] rebase --- docs/releases.md | 1 + pyproject.toml | 2 +- 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/docs/releases.md b/docs/releases.md index f28cc441..ecc99096 100644 --- a/docs/releases.md +++ b/docs/releases.md @@ -249,6 +249,7 @@ Brings `region`-writing support in `.to_icechunk()`, a `ZarrParser` with orders - Fix `ZarrParser` to use public attribute instead of private one ([#916](https://github.com/zarr-developers/VirtualiZarr/pull/916)). By [Max Jones](https://github.com/maxrjones). + ### Documentation - Added FAQ answer comparing the Kerchunk and Icechunk serialization formats. ([#818](https://github.com/zarr-developers/VirtualiZarr/pull/818)). diff --git a/pyproject.toml b/pyproject.toml index 25a14890..26cc2406 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -40,7 +40,7 @@ remote = [ "requests", "aiohttp", "s3fs", - "pydap @ git+https://github.com/pydap/pydap.git@refs/pull/659/merge", + "pydap>=3.5.9", ] # non-kerchunk-based parsers From 1df2b87c789b32cc331242f3427c2f941e0553b1 Mon Sep 17 00:00:00 2001 From: Miguel Jimenez-Urias Date: Thu, 5 Mar 2026 16:44:45 -0800 Subject: [PATCH 11/22] fix mypy --- pyproject.toml | 1 + virtualizarr/parsers/dmrpp.py | 2 +- 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/pyproject.toml b/pyproject.toml index 26cc2406..50d31181 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -268,6 +268,7 @@ module = [ "ujson", "zarr", "requests", + "pydap.*", ] ignore_missing_imports = true diff --git a/virtualizarr/parsers/dmrpp.py b/virtualizarr/parsers/dmrpp.py index 60ae7c15..5c672d19 100644 --- a/virtualizarr/parsers/dmrpp.py +++ b/virtualizarr/parsers/dmrpp.py @@ -207,7 +207,7 @@ def parse_dataset( metadata=metadata, chunkmanifest=chunkmanifest ) manifest_group = ManifestGroup(arrays=manifest_dict, attributes=attrs) - registry = ObjectStoreRegistry() + registry: ObjectStoreRegistry = ObjectStoreRegistry() registry.register(self.data_filepath, object_store) return ManifestStore(registry=registry, group=manifest_group) From c6b11ff393194a9f455a303e09400e43ac739be2 Mon Sep 17 00:00:00 2001 From: Miguel Jimenez-Urias Date: Wed, 11 Mar 2026 10:37:14 -0700 Subject: [PATCH 12/22] skip test on macOS too --- virtualizarr/tests/test_parsers/test_dmrpp.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/virtualizarr/tests/test_parsers/test_dmrpp.py b/virtualizarr/tests/test_parsers/test_dmrpp.py index fcb6ab88..946a786f 100644 --- a/virtualizarr/tests/test_parsers/test_dmrpp.py +++ b/virtualizarr/tests/test_parsers/test_dmrpp.py @@ -1,4 +1,3 @@ -import platform import textwrap from contextlib import nullcontext from xml.etree import ElementTree as ET @@ -436,7 +435,6 @@ def test_NASA_dmrpp_load(data_url, dmrpp_url): @pytest.mark.xfail( - platform.system() == "Linux", reason="See https://github.com/zarr-developers/VirtualiZarr/issues/904.", ) @pytest.mark.parametrize( From 4f3763adf45557a86c4e646ecd923be996ca3eb8 Mon Sep 17 00:00:00 2001 From: Miguel Jimenez-Urias Date: Mon, 4 May 2026 12:43:31 -0700 Subject: [PATCH 13/22] update how inline references are handled v1 --- virtualizarr/parsers/dmrpp.py | 16 +++++++++++----- virtualizarr/tests/test_parsers/test_dmrpp.py | 14 ++++++-------- 2 files changed, 17 insertions(+), 13 deletions(-) diff --git a/virtualizarr/parsers/dmrpp.py b/virtualizarr/parsers/dmrpp.py index 5c672d19..9fecfa1b 100644 --- a/virtualizarr/parsers/dmrpp.py +++ b/virtualizarr/parsers/dmrpp.py @@ -15,6 +15,7 @@ ManifestGroup, ManifestStore, ) +from virtualizarr.manifests.manifest import ChunkEntry from virtualizarr.manifests.utils import create_v3_array_metadata from virtualizarr.parsers.utils import encode_cf_fill_value @@ -181,6 +182,7 @@ def parse_dataset( for var in vars_dict.keys(): chunkmanifest = vars_dict[var].pop("chunkmanifest", None) + # remove opendap-related metadata meta = dict( [ (k, v) @@ -195,17 +197,21 @@ def parse_dataset( meta["attributes"]["_FillValue"] = encoded_cf_fill_value if "inline" in meta: - raise NotImplementedError( - "Reading inlined reference data is currently not supported. " - "See https://github.com/zarr-developers/VirtualiZarr/issues/489", + # chunkmanifest is empty - extract inline data from dict + data = meta.pop("inline", None) + shape = meta.pop("shape", None) + chunk_entry = ChunkEntry( + path="__inlined__", offset=0, length=len(data), data=data ) - meta.pop("inline", None) + chunkmanifest = ChunkManifest(entries=chunk_entry, shape=shape) + else: + chunkmanifest = ChunkManifest(chunkmanifest) - chunkmanifest = ChunkManifest(chunkmanifest) metadata = create_v3_array_metadata(**meta) manifest_dict[var] = ManifestArray( metadata=metadata, chunkmanifest=chunkmanifest ) + manifest_group = ManifestGroup(arrays=manifest_dict, attributes=attrs) registry: ObjectStoreRegistry = ObjectStoreRegistry() registry.register(self.data_filepath, object_store) diff --git a/virtualizarr/tests/test_parsers/test_dmrpp.py b/virtualizarr/tests/test_parsers/test_dmrpp.py index 946a786f..d0c77c10 100644 --- a/virtualizarr/tests/test_parsers/test_dmrpp.py +++ b/virtualizarr/tests/test_parsers/test_dmrpp.py @@ -597,18 +597,16 @@ def test_dmrpp_missing_attrib_validation(dmrpp_xml_with_missing_attrib): assert manifest_store._group is not None +@pytest.mark.xfail( + reason="inline reference is failing locally", +) @requires_network -def test_nonimplement_inlinevalue(hdf5_missing_value): +def test_inlinevalue(): dmrpp_file = ( "http://test.opendap.org/opendap/data/dmrpp/compact_lowlevel.h5.dmrpp.file" ) session = requests.Session() dmrpp = session.get(dmrpp_file).content.decode() - parser = dmrparser(dmrpp, filepath=f"file:///{hdf5_missing_value}") + parser = dmrparser(dmrpp, filepath="file:///") store = obstore_local(url=parser.data_filepath) - - with pytest.raises( - NotImplementedError, - match="Reading inlined reference data is currently not supported", - ): - parser.parse_dataset(object_store=store) + parser.parse_dataset(object_store=store) From 8dccd228378f47cbd92fd140df1fe95001c5c6a7 Mon Sep 17 00:00:00 2001 From: Miguel Jimenez-Urias Date: Mon, 4 May 2026 13:48:52 -0700 Subject: [PATCH 14/22] update how inline references are handled v2 --- virtualizarr/parsers/dmrpp.py | 24 +++++++++++++------ virtualizarr/tests/test_parsers/test_dmrpp.py | 13 ++++++---- 2 files changed, 26 insertions(+), 11 deletions(-) diff --git a/virtualizarr/parsers/dmrpp.py b/virtualizarr/parsers/dmrpp.py index 9fecfa1b..5c5ae0a9 100644 --- a/virtualizarr/parsers/dmrpp.py +++ b/virtualizarr/parsers/dmrpp.py @@ -1,3 +1,4 @@ +import base64 import io import warnings from pathlib import Path @@ -15,7 +16,6 @@ ManifestGroup, ManifestStore, ) -from virtualizarr.manifests.manifest import ChunkEntry from virtualizarr.manifests.utils import create_v3_array_metadata from virtualizarr.parsers.utils import encode_cf_fill_value @@ -197,13 +197,23 @@ def parse_dataset( meta["attributes"]["_FillValue"] = encoded_cf_fill_value if "inline" in meta: - # chunkmanifest is empty - extract inline data from dict + # extract data already decoded into array/string data = meta.pop("inline", None) - shape = meta.pop("shape", None) - chunk_entry = ChunkEntry( - path="__inlined__", offset=0, length=len(data), data=data - ) - chunkmanifest = ChunkManifest(entries=chunk_entry, shape=shape) + bdata = base64.b64encode(data) + # chunk_entry = ChunkEntry( + # path="", offset=0, length=len(bdata), data=bdata + # ) + # chunkmanifest = ChunkManifest(entries=chunk_entry) + + chunks = { + "0.0": { + "path": "__inline__", + "offset": 0, + "length": len(bdata), + "data": bdata, + }, + } + chunkmanifest = ChunkManifest(entries=chunks) else: chunkmanifest = ChunkManifest(chunkmanifest) diff --git a/virtualizarr/tests/test_parsers/test_dmrpp.py b/virtualizarr/tests/test_parsers/test_dmrpp.py index d0c77c10..5e382197 100644 --- a/virtualizarr/tests/test_parsers/test_dmrpp.py +++ b/virtualizarr/tests/test_parsers/test_dmrpp.py @@ -597,11 +597,13 @@ def test_dmrpp_missing_attrib_validation(dmrpp_xml_with_missing_attrib): assert manifest_store._group is not None -@pytest.mark.xfail( - reason="inline reference is failing locally", -) @requires_network def test_inlinevalue(): + """ + Test that inline values can be parsed into manifest + """ + expected_bytes = b"AAAAAAAAAAABAAAAAAAAAAIAAAAAAAAAAwAAAAAAAAAEAAAAAAAAAAUAAAAAAAAABgAAAAAAAAAHAAAAAAAAAAgAAAAAAAAACQAAAAAAAAA=" + dmrpp_file = ( "http://test.opendap.org/opendap/data/dmrpp/compact_lowlevel.h5.dmrpp.file" ) @@ -609,4 +611,7 @@ def test_inlinevalue(): dmrpp = session.get(dmrpp_file).content.decode() parser = dmrparser(dmrpp, filepath="file:///") store = obstore_local(url=parser.data_filepath) - parser.parse_dataset(object_store=store) + ms = parser.parse_dataset(object_store=store) + vds = ms.to_virtual_dataset() + assert "my_dataset" in vds + assert ms._group["my_dataset"]._manifest._inlined == {(0, 0): expected_bytes} From cf838b742f63613649d356d15c075c9272775882 Mon Sep 17 00:00:00 2001 From: Miguel Jimenez-Urias Date: Mon, 4 May 2026 14:45:42 -0700 Subject: [PATCH 15/22] add pydap to min-deps via `dev` - avoid min-deps ci/cd failure --- pyproject.toml | 1 + 1 file changed, 1 insertion(+) diff --git a/pyproject.toml b/pyproject.toml index 50d31181..74b93e05 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -123,6 +123,7 @@ dev = [ "pandas-stubs", "pooch", "pre-commit", + "pydap>=3.5.9" "pytest", "pytest-asyncio", "pytest-cov", From 9a85bfdb07c5e3f1aca2aa5e4afe7eb890c62a68 Mon Sep 17 00:00:00 2001 From: Miguel Jimenez-Urias Date: Mon, 4 May 2026 14:51:29 -0700 Subject: [PATCH 16/22] fix missing `,` --- pyproject.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pyproject.toml b/pyproject.toml index 74b93e05..e33eb88c 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -123,7 +123,7 @@ dev = [ "pandas-stubs", "pooch", "pre-commit", - "pydap>=3.5.9" + "pydap>=3.5.9", "pytest", "pytest-asyncio", "pytest-cov", From c1fec298407807158687dd8febbcd00b1d7f837a Mon Sep 17 00:00:00 2001 From: Miguel Jimenez-Urias Date: Tue, 9 Jun 2026 09:47:03 -0700 Subject: [PATCH 17/22] add a separate isolated opt-dep entry for dmrpp --- pyproject.toml | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index e33eb88c..ec4bfea4 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -40,7 +40,6 @@ remote = [ "requests", "aiohttp", "s3fs", - "pydap>=3.5.9", ] # non-kerchunk-based parsers @@ -51,6 +50,9 @@ hdf = [ "imagecodecs-numcodecs==2024.6.1", ] +# dmrpp +dmrpp = ["pydap>=3.5.10"] + zarr = ["arro3-core"] # kerchunk-based parsers @@ -84,7 +86,8 @@ all_parsers = [ "virtualizarr[kerchunk_parquet]", "virtualizarr[tiff]", "virtualizarr[grib]", - "virtualizarr[zarr]" + "virtualizarr[zarr]", + "virtualizarr[dmrpp]", ] # writers @@ -123,7 +126,6 @@ dev = [ "pandas-stubs", "pooch", "pre-commit", - "pydap>=3.5.9", "pytest", "pytest-asyncio", "pytest-cov", From 2d08220eafb2941dd8916ab234708510b2b408d5 Mon Sep 17 00:00:00 2001 From: Miguel Jimenez-Urias Date: Fri, 12 Jun 2026 16:30:13 -0700 Subject: [PATCH 18/22] update dmrpp migration - soft imports and code refactor --- pyproject.toml | 4 +- virtualizarr/parsers/dmrpp.py | 169 +------- virtualizarr/tests/__init__.py | 2 + virtualizarr/tests/test_parsers/test_dmrpp.py | 374 +----------------- 4 files changed, 23 insertions(+), 526 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index ec4bfea4..7567ff4c 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -51,7 +51,9 @@ hdf = [ ] # dmrpp -dmrpp = ["pydap>=3.5.10"] +dmrpp = [ + "pydap @ git+https://github.com/pydap/pydap.git@refs/pull/697/head", +] zarr = ["arro3-core"] diff --git a/virtualizarr/parsers/dmrpp.py b/virtualizarr/parsers/dmrpp.py index 5c5ae0a9..29e183b9 100644 --- a/virtualizarr/parsers/dmrpp.py +++ b/virtualizarr/parsers/dmrpp.py @@ -1,23 +1,15 @@ -import base64 import io -import warnings -from pathlib import Path from typing import Iterable from xml.etree import ElementTree as ET -from obspec_utils.protocols import ReadableStore +# from obspec_utils.protocols import ReadableStore from obspec_utils.readers import EagerStoreReader from obspec_utils.registry import ObjectStoreRegistry -from pydap.parsers.dmr import DMRPPParser as _DMRPPParser -from virtualizarr.manifests import ( - ChunkManifest, - ManifestArray, - ManifestGroup, - ManifestStore, -) -from virtualizarr.manifests.utils import create_v3_array_metadata -from virtualizarr.parsers.utils import encode_cf_fill_value +from virtualizarr.manifests import ManifestStore +from virtualizarr.utils import soft_import + +pydap = soft_import("pydap", "parsing dmrpp references", strict=False) class DMRPPParser: @@ -71,6 +63,8 @@ def __call__( else url.removesuffix(".dmrpp") ) + from pydap.virtualizarr.parser import DMRParser + parser = DMRParser( root=ET.parse(stream).getroot(), data_filepath=url, @@ -78,152 +72,3 @@ def __call__( ) manifest_store = parser.parse_dataset(object_store=store, group=self.group) return manifest_store - - -class DMRParser: - """ - Parser for the OPeNDAP DMR++ XML format. - Reads groups, dimensions, coordinates, data variables, encoding, chunk manifests, and attributes. - Highly modular to allow support for older dmrpp schema versions. Includes many utility functions to extract - different information such as finding all variable tags, splitting hdf5 groups, parsing dimensions, and more. - - OPeNDAP DMR++ homepage: https://docs.opendap.org/index.php/DMR%2B%2B - """ - - # DAP and DMRPP XML namespaces - _NS = { - "dap": "http://xml.opendap.org/ns/DAP/4.0#", - "dmrpp": "http://xml.opendap.org/dap/dmrpp/1.0.0#", - } - - root: ET.Element - data_filepath: str - - def __init__( - self, - root: ET.Element, - data_filepath: str | None = None, - skip_variables: Iterable[str] | None = None, - ): - """ - Initialize the DMRParser with the given DMR++ file contents and source data file path. - - Parameters - ---------- - root - Root of the xml tree structure of a DMR++ file. - data_filepath - The path to the actual data file that will be set in the chunk manifests. - If None, the data file path is taken from the DMR++ file. - """ - self.root = root - self.data_filepath = ( - data_filepath if data_filepath is not None else self.root.attrib["name"] - ) - self.skip_variables = skip_variables or () - self._validation_issues: list[str] = [] - - def dmrparser(self) -> _DMRPPParser: - """Exposes the _DMRParser to external use (avoids breaking changes)""" - parser = _DMRPPParser( - root=self.root, - data_filepath=self.data_filepath, - skip_variables=self.skip_variables, - ) - self._validation_issues = parser._validation_issues - return parser - - def parse_dataset( - self, - object_store: ReadableStore, - group: str | None = None, - ) -> ManifestStore: - """ - Parses the given file and creates a ManifestStore. - - Parameters - ---------- - group - The group to parse. Ignored if no groups are present, and the entire - dataset is parsed. If `None` or "/", and groups are present, the first group - is parsed. If not `None` or "/", and no groups are present, a UserWarning - is issued indicating that the group will be ignored. - - Returns - ------- - ManifestStore - - Examples - -------- - Open a sample DMR++ file and parse the dataset - """ - group = group or "/" - ngroups = len(self.root.findall("dap:Group", self._NS)) - - if ngroups == 0 and group != "/": - warnings.warn( - f"No groups in DMR++ file {self.data_filepath!r}; " - f"ignoring group parameter {group!r}" - ) - - group_path = Path("/") if ngroups == 0 else Path("/") / group.removeprefix("/") - - dataset_element = self.dmrparser()._split_groups(self.root).get(group_path) - - if dataset_element is None: - raise ValueError( - f"Group {group_path} not found in DMR++ file {self.data_filepath!r}" - ) - - # get two dictionary containing relevant metadata - vars_dict, attrs = self.dmrparser()._parse_dataset(dataset_element) - - manifest_dict: dict[str, ManifestArray] = {} - - for var in vars_dict.keys(): - chunkmanifest = vars_dict[var].pop("chunkmanifest", None) - # remove opendap-related metadata - meta = dict( - [ - (k, v) - for k, v in vars_dict[var].items() - if k not in ["Maps", "fqn_dims"] - ] - ) - if "_FillValue" in meta["attributes"]: - encoded_cf_fill_value = encode_cf_fill_value( - meta["attributes"]["_FillValue"], meta["data_type"] - ) - meta["attributes"]["_FillValue"] = encoded_cf_fill_value - - if "inline" in meta: - # extract data already decoded into array/string - data = meta.pop("inline", None) - bdata = base64.b64encode(data) - # chunk_entry = ChunkEntry( - # path="", offset=0, length=len(bdata), data=bdata - # ) - # chunkmanifest = ChunkManifest(entries=chunk_entry) - - chunks = { - "0.0": { - "path": "__inline__", - "offset": 0, - "length": len(bdata), - "data": bdata, - }, - } - chunkmanifest = ChunkManifest(entries=chunks) - else: - chunkmanifest = ChunkManifest(chunkmanifest) - - metadata = create_v3_array_metadata(**meta) - manifest_dict[var] = ManifestArray( - metadata=metadata, chunkmanifest=chunkmanifest - ) - - manifest_group = ManifestGroup(arrays=manifest_dict, attributes=attrs) - registry: ObjectStoreRegistry = ObjectStoreRegistry() - registry.register(self.data_filepath, object_store) - - return ManifestStore(registry=registry, group=manifest_group) diff --git a/virtualizarr/tests/__init__.py b/virtualizarr/tests/__init__.py index 026b7c66..78ff40a3 100644 --- a/virtualizarr/tests/__init__.py +++ b/virtualizarr/tests/__init__.py @@ -44,3 +44,5 @@ def _importorskip( has_arro3, requires_arro3 = _importorskip("arro3.core") # The GribberishParser is new in gribberish 1.0.0. has_grib, requires_grib = _importorskip("gribberish", minversion="1.0.0") +# The DMRPPParser is new in pydap 3.5.10 +has_pydap, requires_pydap = _importorskip("pydap") diff --git a/virtualizarr/tests/test_parsers/test_dmrpp.py b/virtualizarr/tests/test_parsers/test_dmrpp.py index 5e382197..7e2890a9 100644 --- a/virtualizarr/tests/test_parsers/test_dmrpp.py +++ b/virtualizarr/tests/test_parsers/test_dmrpp.py @@ -1,20 +1,18 @@ -import textwrap -from contextlib import nullcontext from xml.etree import ElementTree as ET import pytest import requests import xarray as xr -import xarray.testing as xrt from obspec_utils.registry import ObjectStoreRegistry -from packaging import version from virtualizarr.parsers import DMRPPParser, HDFParser -from virtualizarr.parsers.dmrpp import DMRParser -from virtualizarr.tests import requires_network, slow_test +from virtualizarr.tests import requires_network, requires_pydap, slow_test from virtualizarr.tests.utils import obstore_local, obstore_s3 from virtualizarr.xarray import open_virtual_dataset +pytest.importorskip("pydap") +from pydap.virtualizarr.parser import DMRParser # noqa: E402 + urls = [ ( "s3://its-live-data/test-space/cloud-experiments/dmrpp/20240826090000-JPL-L4_GHRSST-SSTfnd-MUR25-GLOB-v02.0-fv04.2.nc", @@ -23,364 +21,8 @@ # TODO: later add MUR, SWOT, TEMPO and others by using kerchunk JSON to read refs (rather than reading the whole netcdf file) ] -# This DMRPP was created following the instructions from https://opendap.github.io/DMRpp-wiki/DMRpp.html#sec-build-them on the files produced by conftest.py with the key matching the fixture name. -DMRPP_XML_STRINGS = { - "netcdf4_file": textwrap.dedent( - """\ - - - - - - - - - NaN - - - latitude - - - Latitude - - - degrees_north - - - Y - - - - - - - - - NaN - - - longitude - - - Longitude - - - degrees_east - - - X - - - - - - - - - NaN - - - time - - - Time - - - hours since 1800-01-01 - - - standard - - - - - - - - - - - 4xDaily Air temperature at sigma level 995 - - - degK - - - 2 - - - 11 - - - TMP - - - Air temperature - - - NMC Reanalysis - - - Surface - - - Individual Obs - - - Other - - - 185.1600037 - 322.1000061 - - - 0.01 - - - - - - - - - - COARDS - - - 4x daily NMC reanalysis (1948) - - - Data is from NMC initialized reanalysis - (4x/day). These are the 0.9950 sigma level values. - - - Model - - - http://www.esrl.noaa.gov/psd/data/gridded/data.ncep.reanalysis.html - - - - 2025-07-16T18:48:42Z - - - 3.21.1-451 - - - 3.21.1-451 - - - libdap-3.21.1-178 - - - build_dmrpp -f /usr/share/hyrax/air.nc -r air.nc.dmr -u OPeNDAP_DMRpp_DATA_ACCESS_URL -M - - - - """ - ), - "hdf5_groups_file": textwrap.dedent( - """\ - - - - - 2025-07-16T21:57:57Z - - - 3.21.1-451 - - - 3.21.1-451 - - - libdap-3.21.1-178 - - - build_dmrpp -f /usr/share/hyrax/hdf5_groups_file.nc -r hdf5_groups_file.nc.dmr -u OPeNDAP_DMRpp_DATA_ACCESS_URL -M - - - - - - - - - - - NaN - - - latitude - - - Latitude - - - degrees_north - - - Y - - - - - - - - - NaN - - - longitude - - - Longitude - - - degrees_east - - - X - - - - - - - - - NaN - - - time - - - Time - - - hours since 1800-01-01 - - - standard - - - - - - - - - - - 4xDaily Air temperature at sigma level 995 - - - degK - - - 2 - - - 11 - - - TMP - - - Air temperature - - - NMC Reanalysis - - - Surface - - - Individual Obs - - - Other - - - 185.1600037 - 322.1000061 - - - 0.01 - - - - - - - - - - COARDS - - - 4x daily NMC reanalysis (1948) - - - Data is from NMC initialized reanalysis - (4x/day). These are the 0.9950 sigma level values. - - - Model - - - http://www.esrl.noaa.gov/psd/data/gridded/data.ncep.reanalysis.html - - - - - """ - ), - "fill_value_scalar_no_chunks_nc4_url": textwrap.dedent( - """\ - - - - - -999 - - - - - empty scalar data - - - - 2025-08-14T23:32:01Z - - - container attributes are no longer supported - - - - - 2025-08-14T23:32:01Z - - - 3.21.1-477 - - - 3.21.1-477 - - - libdap-3.21.1-222 - - - build_dmrpp -f /usr/share/hyrax/fill_value_scalar_no_chunks.nc4 -r fill_value_scalar_no_chunks.nc4.dmr -u OPeNDAP_DMRpp_DATA_ACCESS_URL -M - - - - """ - ), -} - +@requires_pydap def dmrparser(dmrpp_xml_str: str, filepath: str) -> DMRParser: # TODO we should actually create a dmrpp file in a temporary directory # this would avoid the need to pass tmp_path separately @@ -434,6 +76,7 @@ def test_NASA_dmrpp_load(data_url, dmrpp_url): assert ds.load() +<<<<<<< HEAD @pytest.mark.xfail( reason="See https://github.com/zarr-developers/VirtualiZarr/issues/904.", ) @@ -537,6 +180,9 @@ def test_parse_dataset_nested(hdf5_groups_file): ======= >>>>>>> 79b9a79 (rebase) +======= +@requires_pydap +>>>>>>> 5fa3d8f (update dmrpp migration - soft imports and code refactor) def test_dmrpp_simple(dmrpp_xml_simple): """Test parsing a simple valid DMR++ XML creates virtual chunk manifests.""" parser = dmrparser(dmrpp_xml_simple, filepath="file:///simple.nc") @@ -572,6 +218,7 @@ def test_dmrpp_simple(dmrpp_xml_simple): assert isinstance(chunk_info["length"], int) +@requires_pydap def test_dmrpp_missing_attrib_validation(dmrpp_xml_with_missing_attrib): """Test that validation issues are accumulated for missing attributes.""" parser = dmrparser( @@ -598,6 +245,7 @@ def test_dmrpp_missing_attrib_validation(dmrpp_xml_with_missing_attrib): @requires_network +@requires_pydap def test_inlinevalue(): """ Test that inline values can be parsed into manifest From d3c89e9f0a1c24a33bbde47101476142755cef54 Mon Sep 17 00:00:00 2001 From: Miguel Jimenez-Urias Date: Fri, 12 Jun 2026 16:45:12 -0700 Subject: [PATCH 19/22] add dmrpp testing to py312 and py313 --- pyproject.toml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index 7567ff4c..a7c6c652 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -220,8 +220,8 @@ run-flaky-tests = { cmd = "pytest -m flaky --run-flaky --run-network-tests --ver min-deps = ["dev", "test", "hdf", "hdf5-lib"] # VirtualiZarr/conftest.py using h5py, so the minimum set of dependencies for testing still includes hdf libs # Inherit from min-deps to get all the test commands, along with optional dependencies test = ["dev", "test", "remote", "hdf", "netcdf3", "fits", "icechunk", "kerchunk", "kerchunk_parquet", "hdf5-lib", "zarr", "py314"] -test-py312 = ["dev", "test", "remote", "hdf", "netcdf3", "fits", "icechunk", "kerchunk", "kerchunk_parquet", "hdf5-lib", "tiff", "grib", "zarr", "py312"] # test against python 3.13 -test-py313 = ["dev", "test", "remote", "hdf", "netcdf3", "fits", "icechunk", "kerchunk", "kerchunk_parquet", "hdf5-lib", "tiff", "grib", "zarr", "py313"] # test against python 3.13 +test-py312 = ["dev", "test", "remote", "hdf", "netcdf3", "fits", "icechunk", "kerchunk", "kerchunk_parquet", "hdf5-lib", "tiff", "grib", "dmrpp", "zarr", "py312"] # test against python 3.13 +test-py313 = ["dev", "test", "remote", "hdf", "netcdf3", "fits", "icechunk", "kerchunk", "kerchunk_parquet", "hdf5-lib", "tiff", "grib", "dmrpp", "zarr", "py313"] # test against python 3.13 test-py314 = ["dev", "test", "remote", "hdf", "netcdf3", "fits", "icechunk", "kerchunk", "kerchunk_parquet", "hdf5-lib", "zarr", "py314"] # test against python 3.14 minio = ["dev", "remote", "hdf", "netcdf3", "fits", "icechunk", "kerchunk", "hdf5-lib", "py314", "zarr", "minio"] # tiff (virtual-tiff) is excluded: virtual-tiff 0.5.0 imports zarr's CodecJSON From cf4cefa95dcdf6d273f1fa373a9bc88a0ca156df Mon Sep 17 00:00:00 2001 From: Miguel Jimenez-Urias Date: Mon, 22 Jun 2026 22:11:04 -0700 Subject: [PATCH 20/22] migrate dmrpp reading to pydap --- virtualizarr/parsers/dmrpp.py | 20 +++---------------- virtualizarr/tests/test_parsers/test_dmrpp.py | 6 +++--- 2 files changed, 6 insertions(+), 20 deletions(-) diff --git a/virtualizarr/parsers/dmrpp.py b/virtualizarr/parsers/dmrpp.py index 29e183b9..ad83a4a4 100644 --- a/virtualizarr/parsers/dmrpp.py +++ b/virtualizarr/parsers/dmrpp.py @@ -1,9 +1,5 @@ -import io from typing import Iterable -from xml.etree import ElementTree as ET -# from obspec_utils.protocols import ReadableStore -from obspec_utils.readers import EagerStoreReader from obspec_utils.registry import ObjectStoreRegistry from virtualizarr.manifests import ManifestStore @@ -52,23 +48,13 @@ def __call__( ManifestStore A ManifestStore that provides a Zarr representation of the data source referenced by the DMR++ file. """ - store, path_in_store = registry.resolve(url) - reader = EagerStoreReader(store=store, path=path_in_store) - file_bytes = reader.readall() - stream = io.BytesIO(file_bytes) - - url = ( - url.removesuffix(".dap.dmrpp") - if url.endswith(".dap.dmrpp") - else url.removesuffix(".dmrpp") - ) from pydap.virtualizarr.parser import DMRParser parser = DMRParser( - root=ET.parse(stream).getroot(), - data_filepath=url, + url=url, + object_store=registry, skip_variables=self.skip_variables, ) - manifest_store = parser.parse_dataset(object_store=store, group=self.group) + manifest_store = parser.parse_dataset(group=self.group) return manifest_store diff --git a/virtualizarr/tests/test_parsers/test_dmrpp.py b/virtualizarr/tests/test_parsers/test_dmrpp.py index 7e2890a9..a684f7a2 100644 --- a/virtualizarr/tests/test_parsers/test_dmrpp.py +++ b/virtualizarr/tests/test_parsers/test_dmrpp.py @@ -1,4 +1,4 @@ -from xml.etree import ElementTree as ET +# from xml.etree import ElementTree as ET import pytest import requests @@ -23,11 +23,11 @@ @requires_pydap -def dmrparser(dmrpp_xml_str: str, filepath: str) -> DMRParser: +def dmrparser(url: str, filepath: str) -> DMRParser: # TODO we should actually create a dmrpp file in a temporary directory # this would avoid the need to pass tmp_path separately - return DMRParser(root=ET.fromstring(dmrpp_xml_str), data_filepath=filepath) + return DMRParser(url=url, data_filepath=filepath) @slow_test From 86d8f6282ebef888e2baf6aa35630a27625a161a Mon Sep 17 00:00:00 2001 From: Miguel Jimenez-Urias Date: Tue, 23 Jun 2026 22:05:04 -0700 Subject: [PATCH 21/22] rebase + update tests --- virtualizarr/tests/test_parsers/test_dmrpp.py | 142 ++---------------- 1 file changed, 14 insertions(+), 128 deletions(-) diff --git a/virtualizarr/tests/test_parsers/test_dmrpp.py b/virtualizarr/tests/test_parsers/test_dmrpp.py index a684f7a2..fd90b042 100644 --- a/virtualizarr/tests/test_parsers/test_dmrpp.py +++ b/virtualizarr/tests/test_parsers/test_dmrpp.py @@ -1,5 +1,3 @@ -# from xml.etree import ElementTree as ET - import pytest import requests import xarray as xr @@ -22,16 +20,9 @@ ] -@requires_pydap -def dmrparser(url: str, filepath: str) -> DMRParser: - # TODO we should actually create a dmrpp file in a temporary directory - # this would avoid the need to pass tmp_path separately - - return DMRParser(url=url, data_filepath=filepath) - - @slow_test @requires_network +@requires_pydap @pytest.mark.parametrize("data_url, dmrpp_url", urls) def test_NASA_dmrpp(data_url, dmrpp_url): store = obstore_s3( @@ -58,6 +49,7 @@ def test_NASA_dmrpp(data_url, dmrpp_url): @requires_network +@requires_pydap @slow_test @pytest.mark.parametrize("data_url, dmrpp_url", urls) def test_NASA_dmrpp_load(data_url, dmrpp_url): @@ -76,121 +68,15 @@ def test_NASA_dmrpp_load(data_url, dmrpp_url): assert ds.load() -<<<<<<< HEAD -@pytest.mark.xfail( - reason="See https://github.com/zarr-developers/VirtualiZarr/issues/904.", -) -@pytest.mark.parametrize( - "group,warns", - [ - pytest.param(None, False, id="None"), - pytest.param("/", False, id="/"), - pytest.param("/no-such-group", True, id="/no-such-group"), - ], -) -def test_parse_dataset(group: str | None, warns: bool, netcdf4_file): - drmpp = dmrparser( - DMRPP_XML_STRINGS["netcdf4_file"], filepath=f"file://{netcdf4_file}" - ) - store = obstore_local(url=drmpp.data_filepath) - - with nullcontext() if warns else pytest.raises(BaseException, match="DID NOT WARN"): - with pytest.warns(UserWarning, match=f"ignoring group parameter {group!r}"): - ms = drmpp.parse_dataset(object_store=store, group=group) - - vds = ms.to_virtual_dataset() - - assert vds.sizes == {"lat": 25, "lon": 53, "time": 2920} - assert vds.data_vars.keys() == {"air"} - assert vds.data_vars["air"].dims == ("time", "lat", "lon") - assert vds.attrs == { - "Conventions": "COARDS", - "title": "4x daily NMC reanalysis (1948)", - "description": "Data is from NMC initialized reanalysis\n(4x/day). These are the 0.9950 sigma level values.", - "platform": "Model", - "references": "http://www.esrl.noaa.gov/psd/data/gridded/data.ncep.reanalysis.html", - } - assert vds.coords.keys() == {"lat", "lon", "time"} - - -@pytest.mark.xfail( - version.parse(xr.__version__) < version.parse("2025.7.1"), - reason="Offsets in file changed", -) -def test_parse_dataset_nested(hdf5_groups_file): - nested_groups_dmrpp = dmrparser( - DMRPP_XML_STRINGS["hdf5_groups_file"], filepath=f"file://{hdf5_groups_file}" - ) - store = obstore_local(url=f"file://{nested_groups_dmrpp.data_filepath}") - - vds_root_implicit = nested_groups_dmrpp.parse_dataset( - object_store=store - ).to_virtual_dataset(loadable_variables=[]) - vds_root = nested_groups_dmrpp.parse_dataset( - group="/", object_store=store - ).to_virtual_dataset(loadable_variables=[]) - - xrt.assert_identical(vds_root_implicit, vds_root) - assert vds_root.sizes == {} - - vds_g1 = nested_groups_dmrpp.parse_dataset( - group="/test", object_store=store - ).to_virtual_dataset(loadable_variables=[]) - assert vds_g1.sizes == {} - - vds_g2 = nested_groups_dmrpp.parse_dataset( - group="/test/group", object_store=store - ).to_virtual_dataset(loadable_variables=[]) - - assert vds_g2.sizes == {"time": 2920, "lat": 25, "lon": 53} - assert vds_g2.data_vars.keys() == {"air"} - assert vds_g2.data_vars["air"].dims == ("time", "lat", "lon") - - -<<<<<<< HEAD -# def test_parse_variable(netcdf4_file): -# parser = dmrparser(DMRPP_XML_STRINGS["netcdf4_file"], filepath=netcdf4_file) - - # var = parser._parse_variable(parser.find_node_fqn("/air")) - # assert var.metadata.dtype.to_native_dtype() == "int16" - # assert var.metadata.dimension_names == ("time", "lat", "lon") - # assert var.shape == (2920, 25, 53) - # assert var.metadata.chunks == (2920, 25, 53) - # # _FillValue is encoded for array dtype - # assert var.metadata.attributes["scale_factor"] == 0.01 - # assert ( - # var.metadata.attributes["long_name"] - # == "4xDaily Air temperature at sigma level 995" - # ) - - -# @pytest.mark.parametrize( -# "attr_path, expected", -# [ -# ("air/long_name", {"long_name": "4xDaily Air temperature at sigma level 995"}), -# ("air/scale_factor", {"scale_factor": 0.01}), -# ], -# ) -# def test_parse_attribute(netcdf4_file, attr_path, expected): -# parser = dmrparser(DMRPP_XML_STRINGS["netcdf4_file"], filepath=netcdf4_file) - -# result = parser._parse_attribute(parser.find_node_fqn(attr_path)) -# assert result == expected - - -======= ->>>>>>> 79b9a79 (rebase) -======= @requires_pydap ->>>>>>> 5fa3d8f (update dmrpp migration - soft imports and code refactor) def test_dmrpp_simple(dmrpp_xml_simple): """Test parsing a simple valid DMR++ XML creates virtual chunk manifests.""" - parser = dmrparser(dmrpp_xml_simple, filepath="file:///simple.nc") + parser = DMRParser( + dmrpp_xml_simple, object_store=obstore_local(url="file:///simple.nc") + ) # Parse dataset - manifest_store = parser.parse_dataset( - object_store=obstore_local(url="file:///"), group="/" - ) + manifest_store = parser.parse_dataset(group="/") # Verify manifest store is created assert manifest_store is not None @@ -221,14 +107,13 @@ def test_dmrpp_simple(dmrpp_xml_simple): @requires_pydap def test_dmrpp_missing_attrib_validation(dmrpp_xml_with_missing_attrib): """Test that validation issues are accumulated for missing attributes.""" - parser = dmrparser( - dmrpp_xml_with_missing_attrib, filepath="file:///validation_test.nc" + parser = DMRParser( + dmrpp_xml_with_missing_attrib, + object_store=obstore_local(url="file:///validation_test.nc"), ) # Parse dataset - this should accumulate validation issues - manifest_store = parser.parse_dataset( - object_store=obstore_local(url="file:///"), group="/" - ) + manifest_store = parser.parse_dataset(group="/") # Verify that validation issues were accumulated assert len(parser._validation_issues) > 0 @@ -257,9 +142,10 @@ def test_inlinevalue(): ) session = requests.Session() dmrpp = session.get(dmrpp_file).content.decode() - parser = dmrparser(dmrpp, filepath="file:///") - store = obstore_local(url=parser.data_filepath) - ms = parser.parse_dataset(object_store=store) + parser = DMRParser( + dmrpp, object_store=obstore_local(url="file:///compact_lowlevel.h5") + ) + ms = parser.parse_dataset() vds = ms.to_virtual_dataset() assert "my_dataset" in vds assert ms._group["my_dataset"]._manifest._inlined == {(0, 0): expected_bytes} From 75cab1c77ddd06d7ffbc7f1180b482624d2db2d8 Mon Sep 17 00:00:00 2001 From: Miguel Jimenez-Urias Date: Tue, 23 Jun 2026 22:14:51 -0700 Subject: [PATCH 22/22] document the change --- docs/releases.md | 3 +++ 1 file changed, 3 insertions(+) diff --git a/docs/releases.md b/docs/releases.md index ecc99096..32ac6f22 100644 --- a/docs/releases.md +++ b/docs/releases.md @@ -33,6 +33,9 @@ ### Internal changes +- `DMRPPParser` is now migrated to [pydap](https://github.com/pydap/pydap) (`pydap>=3.5.10`) (see [pydap#417](https://github.com/pydap/pydap/issues/417)). To use the dmrpp parser one must install the extra `pip install "virtualizarr[dmrpp]"`. + By [Miguel Jimenez-Urias](https://github.com/Mikejmnez) + - Speed up virtual chunk container validation when writing to Icechunk by passing the supported prefixes as a tuple to `str.startswith`, which runs the loop over prefixes in C rather than in a Python generator. The per-reference check is now ~2.6x faster in the common single-container case, which matters when writing manifests with millions of virtual references. The check is still a per-reference Python loop overall (see [icechunk#1167](https://github.com/earth-mover/icechunk/issues/1167) for pushing it down to Icechunk entirely). By [Tom Nicholas](https://github.com/TomNicholas).