From 6476ff87e6a8ff8fb5c156146df261ea8a7f8bd0 Mon Sep 17 00:00:00 2001 From: Maarten Sebregts Date: Thu, 6 Mar 2025 16:38:07 +0100 Subject: [PATCH 1/7] Refactor iterators used in ids2nc and nc2ids --- imas/backends/netcdf/ids2nc.py | 31 +--------- imas/backends/netcdf/iterators.py | 98 +++++++++++++++++++++++++++++++ imas/backends/netcdf/nc2ids.py | 58 +++--------------- 3 files changed, 108 insertions(+), 79 deletions(-) create mode 100644 imas/backends/netcdf/iterators.py diff --git a/imas/backends/netcdf/ids2nc.py b/imas/backends/netcdf/ids2nc.py index 2b892838..d5a8816c 100644 --- a/imas/backends/netcdf/ids2nc.py +++ b/imas/backends/netcdf/ids2nc.py @@ -1,21 +1,16 @@ # This file is part of IMAS-Python. # You should have received the IMAS-Python LICENSE file with this project. -"""NetCDF IO support for IMAS-Python. Requires [netcdf] extra dependencies. -""" - -from typing import Iterator, Tuple +"""NetCDF IO support for IMAS-Python. Requires [netcdf] extra dependencies.""" import netCDF4 import numpy from packaging import version from imas.backends.netcdf.nc_metadata import NCMetadata +from imas.backends.netcdf.iterators import indexed_tree_iter from imas.exception import InvalidNetCDFEntry -from imas.ids_base import IDSBase from imas.ids_data_type import IDSDataType from imas.ids_defs import IDS_TIME_MODE_HOMOGENEOUS -from imas.ids_struct_array import IDSStructArray -from imas.ids_structure import IDSStructure from imas.ids_toplevel import IDSToplevel default_fillvals = { @@ -33,26 +28,6 @@ SHAPE_DTYPE = numpy.int32 -def nc_tree_iter( - node: IDSStructure, aos_index: Tuple[int, ...] = () -) -> Iterator[Tuple[Tuple[int, ...], IDSBase]]: - """Tree iterator that tracks indices of all ancestor array of structures. - - Args: - node: IDS node to iterate over - - Yields: - (aos_index, node) for all filled nodes. - """ - for child in node.iter_nonempty_(): - yield (aos_index, child) - if isinstance(child, IDSStructArray): - for i in range(len(child)): - yield from nc_tree_iter(child[i], aos_index + (i,)) - elif isinstance(child, IDSStructure): - yield from nc_tree_iter(child, aos_index) - - class IDS2NC: """Class responsible for storing an IDS to a NetCDF file.""" @@ -105,7 +80,7 @@ def collect_filled_data(self) -> None: dimension_size = {} get_dimensions = self.ncmeta.get_dimensions - for aos_index, node in nc_tree_iter(self.ids): + for aos_index, node in indexed_tree_iter(self.ids): path = node.metadata.path_string filled_data[path][aos_index] = node ndim = node.metadata.ndim diff --git a/imas/backends/netcdf/iterators.py b/imas/backends/netcdf/iterators.py new file mode 100644 index 00000000..bff40e43 --- /dev/null +++ b/imas/backends/netcdf/iterators.py @@ -0,0 +1,98 @@ +from typing import Iterator, List, Optional, Tuple + +from imas.ids_base import IDSBase +from imas.ids_data_type import IDSDataType +from imas.ids_metadata import IDSMetadata +from imas.ids_struct_array import IDSStructArray +from imas.ids_structure import IDSStructure +from imas.ids_toplevel import IDSToplevel + + +def _split_on_aos(metadata: IDSMetadata): + """Split paths per IDS.""" + paths = [] + curpath = metadata.name + + item = metadata + while item._parent.data_type is not None: + item = item._parent + if item.data_type is IDSDataType.STRUCT_ARRAY: + paths.append(curpath) + curpath = item.name + else: + curpath = f"{item.name}/{curpath}" + paths.append(curpath) + return paths[::-1] + + +IndexedNode = Tuple[Tuple[int, ...], IDSBase] + + +def indexed_tree_iter( + ids: IDSToplevel, metadata: Optional[IDSMetadata] = None +) -> Iterator[IndexedNode]: + """Tree iterator that tracks indices of all ancestor array of structures. + + Args: + ids: IDS top level element to iterate over + metadata: Iterate over all nodes inside the IDS at the metadata object. + If ``None``, all filled items in the IDS are iterated over. + + Yields: + (aos_indices, node) for all filled nodes. + + Example: + >>> ids = imas.IDSFactory().new("core_profiles") + >>> ids.profiles_1d.resize(2) + >>> ids.profiles_1d[0].time = 1.0 + >>> ids.profiles_1d[1].t_i_average = [1.0] + >>> list(indexed_tree_iter(ids)) + [ + ((), ), + ((0,), ), + ((1,), ) + ] + >>> list(indexed_tree_iter(ids, ids.metadata["profiles_1d/time"])) + [ + ((0,), ), + ((1,), ) + ] + """ # noqa: E501 + if metadata is None: + # Iterate over all filled nodes in the IDS + yield from _full_tree_iter(ids, ()) + + else: + paths = _split_on_aos(metadata) + if len(paths) == 1: + yield (), ids[paths[0]] + else: + yield from _tree_iter(ids, paths, ()) + + +def _tree_iter( + structure: IDSStructure, paths: List[str], curindex: Tuple[int, ...] +) -> Iterator[IndexedNode]: + aos_path, *paths = paths + aos = structure[aos_path] + + if len(paths) == 1: + path = paths[0] + for i, node in enumerate(aos): + yield curindex + (i,), node[path] + + else: + for i, node in enumerate(aos): + yield from _tree_iter(node, paths, curindex + (i,)) + + +def _full_tree_iter( + node: IDSStructure, cur_index: Tuple[int, ...] +) -> Iterator[IndexedNode]: + for child in node.iter_nonempty_(): + yield (cur_index, child) + if isinstance(child, IDSStructArray): + for i in range(len(child)): + yield from _full_tree_iter(child[i], cur_index + (i,)) + elif isinstance(child, IDSStructure): + yield from _full_tree_iter(child, cur_index) diff --git a/imas/backends/netcdf/nc2ids.py b/imas/backends/netcdf/nc2ids.py index 7829d257..e9b524fb 100644 --- a/imas/backends/netcdf/nc2ids.py +++ b/imas/backends/netcdf/nc2ids.py @@ -1,19 +1,18 @@ import logging import os -from typing import Iterator, List, Optional, Tuple +from typing import Optional import netCDF4 import numpy as np from imas.backends.netcdf import ids2nc from imas.backends.netcdf.nc_metadata import NCMetadata +from imas.backends.netcdf.iterators import indexed_tree_iter from imas.exception import InvalidNetCDFEntry -from imas.ids_base import IDSBase from imas.ids_convert import NBCPathMap from imas.ids_data_type import IDSDataType from imas.ids_defs import IDS_TIME_MODE_HOMOGENEOUS from imas.ids_metadata import IDSMetadata -from imas.ids_structure import IDSStructure from imas.ids_toplevel import IDSToplevel logger = logging.getLogger(__name__) @@ -26,49 +25,6 @@ def variable_error(var, issue, value, expected=None) -> InvalidNetCDFEntry: ) -def split_on_aos(metadata: IDSMetadata): - paths = [] - curpath = metadata.name - - item = metadata - while item._parent.data_type is not None: - item = item._parent - if item.data_type is IDSDataType.STRUCT_ARRAY: - paths.append(curpath) - curpath = item.name - else: - curpath = f"{item.name}/{curpath}" - paths.append(curpath) - return paths[::-1] - - -IndexedNode = Tuple[Tuple[int, ...], IDSBase] - - -def tree_iter(structure: IDSStructure, metadata: IDSMetadata) -> Iterator[IndexedNode]: - paths = split_on_aos(metadata) - if len(paths) == 1: - yield (), structure[paths[0]] - else: - yield from _tree_iter(structure, paths, ()) - - -def _tree_iter( - structure: IDSStructure, paths: List[str], curindex: Tuple[int, ...] -) -> Iterator[IndexedNode]: - aos_path, *paths = paths - aos = structure[aos_path] - - if len(paths) == 1: - path = paths[0] - for i, node in enumerate(aos): - yield curindex + (i,), node[path] - - else: - for i, node in enumerate(aos): - yield from _tree_iter(node, paths, curindex + (i,)) - - class NC2IDS: """Class responsible for reading an IDS from a NetCDF group.""" @@ -169,7 +125,7 @@ def run(self, lazy: bool) -> None: if metadata.data_type is IDSDataType.STRUCT_ARRAY: if "sparse" in var.ncattrs(): shapes = self.group[var_name + ":shape"][()] - for index, node in tree_iter(self.ids, target_metadata): + for index, node in indexed_tree_iter(self.ids, target_metadata): node.resize(shapes[index][0]) else: @@ -178,7 +134,7 @@ def run(self, lazy: bool) -> None: metadata.path_string, self.homogeneous_time )[-1] size = self.group.dimensions[dim].size - for _, node in tree_iter(self.ids, target_metadata): + for _, node in indexed_tree_iter(self.ids, target_metadata): node.resize(size) continue @@ -190,7 +146,7 @@ def run(self, lazy: bool) -> None: if "sparse" in var.ncattrs(): if metadata.ndim: shapes = self.group[var_name + ":shape"][()] - for index, node in tree_iter(self.ids, target_metadata): + for index, node in indexed_tree_iter(self.ids, target_metadata): shape = shapes[index] if shape.all(): # NOTE: bypassing IDSPrimitive.value.setter logic @@ -198,7 +154,7 @@ def run(self, lazy: bool) -> None: index + tuple(map(slice, shape)) ] else: - for index, node in tree_iter(self.ids, target_metadata): + for index, node in indexed_tree_iter(self.ids, target_metadata): value = data[index] if value != getattr(var, "_FillValue", None): # NOTE: bypassing IDSPrimitive.value.setter logic @@ -211,7 +167,7 @@ def run(self, lazy: bool) -> None: self.ids[target_metadata.path].value = data else: - for index, node in tree_iter(self.ids, target_metadata): + for index, node in indexed_tree_iter(self.ids, target_metadata): # NOTE: bypassing IDSPrimitive.value.setter logic node._IDSPrimitive__value = data[index] From 815729c436fc51b660c5b7256b8efc12a74c6dd4 Mon Sep 17 00:00:00 2001 From: Maarten Sebregts Date: Fri, 7 Mar 2025 11:14:18 +0100 Subject: [PATCH 2/7] Refactor ids2nc, extract common tensorization logic in IDSTensorizer Allows reuse of functionality to (partially) convert IDSs to xarray Datasets. --- imas/backends/netcdf/ids2nc.py | 138 +------------------- imas/backends/netcdf/ids_tensorizer.py | 173 +++++++++++++++++++++++++ 2 files changed, 177 insertions(+), 134 deletions(-) create mode 100644 imas/backends/netcdf/ids_tensorizer.py diff --git a/imas/backends/netcdf/ids2nc.py b/imas/backends/netcdf/ids2nc.py index d5a8816c..531c7ac2 100644 --- a/imas/backends/netcdf/ids2nc.py +++ b/imas/backends/netcdf/ids2nc.py @@ -3,14 +3,11 @@ """NetCDF IO support for IMAS-Python. Requires [netcdf] extra dependencies.""" import netCDF4 -import numpy from packaging import version -from imas.backends.netcdf.nc_metadata import NCMetadata -from imas.backends.netcdf.iterators import indexed_tree_iter +from imas.backends.netcdf.ids_tensorizer import SHAPE_DTYPE, IDSTensorizer, dtypes from imas.exception import InvalidNetCDFEntry from imas.ids_data_type import IDSDataType -from imas.ids_defs import IDS_TIME_MODE_HOMOGENEOUS from imas.ids_toplevel import IDSToplevel default_fillvals = { @@ -19,16 +16,9 @@ IDSDataType.FLT: netCDF4.default_fillvals["f8"], IDSDataType.CPX: netCDF4.default_fillvals["f8"] * (1 + 1j), } -dtypes = { - IDSDataType.INT: numpy.dtype(numpy.int32), - IDSDataType.STR: str, - IDSDataType.FLT: numpy.dtype(numpy.float64), - IDSDataType.CPX: numpy.dtype(numpy.complex128), -} -SHAPE_DTYPE = numpy.int32 -class IDS2NC: +class IDS2NC(IDSTensorizer): """Class responsible for storing an IDS to a NetCDF file.""" def __init__(self, ids: IDSToplevel, group: netCDF4.Group) -> None: @@ -38,112 +28,18 @@ def __init__(self, ids: IDSToplevel, group: netCDF4.Group) -> None: ids: IDSToplevel to store in the netCDF group group: Empty netCDF group to store the IDS in. """ - self.ids = ids - """IDS to store.""" + super().__init__(ids, []) # pass empty list: tensorize full IDS self.group = group """NetCDF Group to store the IDS in.""" - self.ncmeta = NCMetadata(ids.metadata) - """NetCDF related metadata.""" - self.dimension_size = {} - """Map dimension name to its size.""" - self.filled_data = {} - """Map of IDS paths to filled data nodes.""" - self.filled_variables = set() - """Set of filled IDS variables""" - self.homogeneous_time = ( - ids.ids_properties.homogeneous_time == IDS_TIME_MODE_HOMOGENEOUS - ) - """True iff the IDS time mode is homogeneous.""" - self.shapes = {} - """Map of IDS paths to data shape arrays.""" - def run(self) -> None: """Store the IDS in the NetCDF group.""" self.collect_filled_data() self.determine_data_shapes() self.create_dimensions() self.create_variables() - # Synchronize variables to disk - # This is not strictly required (automatically done by netCDF4 when needed), but - # by separating it we get more meaningful profiling statistics - self.group.sync() self.store_data() - def collect_filled_data(self) -> None: - """Collect all filled data in the IDS and determine dimension sizes. - - Results are stored in :attr:`filled_data` and :attr:`dimension_size`. - """ - # Initialize dictionary with all paths that could exist in this IDS - filled_data = {path: {} for path in self.ncmeta.paths} - dimension_size = {} - get_dimensions = self.ncmeta.get_dimensions - - for aos_index, node in indexed_tree_iter(self.ids): - path = node.metadata.path_string - filled_data[path][aos_index] = node - ndim = node.metadata.ndim - if not ndim: - continue - dimensions = get_dimensions(path, self.homogeneous_time) - # We're only interested in the non-tensorized dimensions: [-ndim:] - for dim_name, size in zip(dimensions[-ndim:], node.shape): - dimension_size[dim_name] = max(dimension_size.get(dim_name, 0), size) - - # Remove paths without data - self.filled_data = {path: data for path, data in filled_data.items() if data} - self.filled_variables = {path.replace("/", ".") for path in self.filled_data} - # Store dimension sizes - self.dimension_size = dimension_size - - def determine_data_shapes(self) -> None: - """Determine tensorized data shapes and sparsity, save in :attr:`shapes`.""" - get_dimensions = self.ncmeta.get_dimensions - - for path, nodes_dict in self.filled_data.items(): - metadata = self.ids.metadata[path] - # Structures don't have a size - if metadata.data_type is IDSDataType.STRUCTURE: - continue - ndim = metadata.ndim - dimensions = get_dimensions(path, self.homogeneous_time) - - # node shape if it is completely filled - full_shape = tuple(self.dimension_size[dim] for dim in dimensions[-ndim:]) - - if len(dimensions) == ndim: - # Data at this path is not tensorized - node = nodes_dict[()] - sparse = node.shape != full_shape - if sparse: - shapes = numpy.array(node.shape, dtype=SHAPE_DTYPE) - - else: - # Data is tensorized, determine if it is homogeneously shaped - aos_dims = get_dimensions(self.ncmeta.aos[path], self.homogeneous_time) - shapes_shape = [self.dimension_size[dim] for dim in aos_dims] - if ndim: - shapes_shape.append(ndim) - shapes = numpy.zeros(shapes_shape, dtype=SHAPE_DTYPE) - - if ndim: # ND types have a shape - for aos_coords, node in nodes_dict.items(): - shapes[aos_coords] = node.shape - sparse = not numpy.array_equiv(shapes, full_shape) - - else: # 0D types don't have a shape - for aos_coords in nodes_dict.keys(): - shapes[aos_coords] = 1 - sparse = not shapes.all() - shapes = None - - if sparse: - self.shapes[path] = shapes - if ndim: - # Ensure there is a pseudo-dimension f"{ndim}D" for shapes variable - self.dimension_size[f"{ndim}D"] = ndim - def create_dimensions(self) -> None: """Create netCDF dimensions.""" for dimension, size in self.dimension_size.items(): @@ -228,14 +124,6 @@ def create_variables(self) -> None: "shape is unset (i.e. filled with _Fillvalue)." ) - def filter_coordinates(self, path: str) -> str: - """Filter the coordinates list from NCMetadata to filled variables only.""" - return " ".join( - coordinate - for coordinate in self.ncmeta.get_coordinates(path, self.homogeneous_time) - if coordinate in self.filled_variables - ) - def store_data(self) -> None: """Store data in the netCDF variables""" for path, nodes_dict in self.filled_data.items(): @@ -273,22 +161,4 @@ def store_data(self) -> None: else: # Data is tensorized: tensorize in-memory - # TODO: depending on the data, tmp_var may be HUGE, we may need a more - # efficient assignment algorithm for large and/or irregular data - tmp_var = numpy.full(var.shape, default_fillvals[metadata.data_type]) - if metadata.data_type is IDSDataType.STR: - tmp_var = numpy.asarray(tmp_var, dtype=object) - - # Fill tmp_var - if shapes is None: - # Data is not sparse, so we can assign to the aos_coords - for aos_coords, node in nodes_dict.items(): - tmp_var[aos_coords] = node.value - else: - # Data is sparse, so we must select a slice - for aos_coords, node in nodes_dict.items(): - tmp_var[aos_coords + tuple(map(slice, node.shape))] = node.value - - # Assign data to variable - var[()] = tmp_var - del tmp_var + var[()] = self.tensorize(path, default_fillvals[metadata.data_type]) diff --git a/imas/backends/netcdf/ids_tensorizer.py b/imas/backends/netcdf/ids_tensorizer.py new file mode 100644 index 00000000..4619919b --- /dev/null +++ b/imas/backends/netcdf/ids_tensorizer.py @@ -0,0 +1,173 @@ +# This file is part of IMAS-Python. +# You should have received the IMAS-Python LICENSE file with this project. +"""Tensorization logic to convert IDSs to netCDF files and/or xarray Datasets.""" + +from typing import List + +import numpy + +from imas.backends.netcdf.iterators import indexed_tree_iter +from imas.backends.netcdf.nc_metadata import NCMetadata +from imas.ids_data_type import IDSDataType +from imas.ids_defs import IDS_TIME_MODE_HOMOGENEOUS +from imas.ids_toplevel import IDSToplevel + +dtypes = { + IDSDataType.INT: numpy.dtype(numpy.int32), + IDSDataType.STR: str, + IDSDataType.FLT: numpy.dtype(numpy.float64), + IDSDataType.CPX: numpy.dtype(numpy.complex128), +} +SHAPE_DTYPE = numpy.int32 + + +class IDSTensorizer: + """Common functionality for tensorizing IDSs. Used in IDS2NC and util.to_xarray.""" + + def __init__(self, ids: IDSToplevel, paths_to_tensorize: List[str]) -> None: + """Initialize IDSTensorizer. + + Args: + ids: IDSToplevel to store in the netCDF group + paths_to_tensorize: Restrict tensorization to the provided paths. If an + empty list is provided, all filled quantities in the IDS will be + tensorized. + """ + self.ids = ids + """IDS to tensorize.""" + self.paths_to_tensorize = paths_to_tensorize + """List of paths to tensorize""" + + self.ncmeta = NCMetadata(ids.metadata) + """NetCDF related metadata.""" + self.dimension_size = {} + """Map dimension name to its size.""" + self.filled_data = {} + """Map of IDS paths to filled data nodes.""" + self.filled_variables = set() + """Set of filled IDS variables""" + self.homogeneous_time = ( + ids.ids_properties.homogeneous_time == IDS_TIME_MODE_HOMOGENEOUS + ) + """True iff the IDS time mode is homogeneous.""" + self.shapes = {} + """Map of IDS paths to data shape arrays.""" + + def collect_filled_data(self) -> None: + """Collect all filled data in the IDS and determine dimension sizes. + + Results are stored in :attr:`filled_data` and :attr:`dimension_size`. + """ + # Initialize dictionary with all paths that could exist in this IDS + filled_data = {path: {} for path in self.ncmeta.paths} + dimension_size = {} + get_dimensions = self.ncmeta.get_dimensions + + if self.paths_to_tensorize: + # Restrict tensorization to provided paths + iterator = ( + item + for path in self.paths_to_tensorize + for item in indexed_tree_iter(self.ids, self.ids.metadata[path]) + if item[1].has_value # Skip nodes without value set + ) + else: + # Tensorize all non-empty nodes + iterator = indexed_tree_iter(self.ids) + + for aos_index, node in iterator: + path = node.metadata.path_string + filled_data[path][aos_index] = node + ndim = node.metadata.ndim + if not ndim: + continue + dimensions = get_dimensions(path, self.homogeneous_time) + # We're only interested in the non-tensorized dimensions: [-ndim:] + for dim_name, size in zip(dimensions[-ndim:], node.shape): + dimension_size[dim_name] = max(dimension_size.get(dim_name, 0), size) + + # Remove paths without data + self.filled_data = {path: data for path, data in filled_data.items() if data} + self.filled_variables = {path.replace("/", ".") for path in self.filled_data} + # Store dimension sizes + self.dimension_size = dimension_size + + def determine_data_shapes(self) -> None: + """Determine tensorized data shapes and sparsity, save in :attr:`shapes`.""" + get_dimensions = self.ncmeta.get_dimensions + + for path, nodes_dict in self.filled_data.items(): + metadata = self.ids.metadata[path] + # Structures don't have a size + if metadata.data_type is IDSDataType.STRUCTURE: + continue + ndim = metadata.ndim + dimensions = get_dimensions(path, self.homogeneous_time) + + # node shape if it is completely filled + full_shape = tuple(self.dimension_size[dim] for dim in dimensions[-ndim:]) + + if len(dimensions) == ndim: + # Data at this path is not tensorized + node = nodes_dict[()] + sparse = node.shape != full_shape + if sparse: + shapes = numpy.array(node.shape, dtype=SHAPE_DTYPE) + + else: + # Data is tensorized, determine if it is homogeneously shaped + aos_dims = get_dimensions(self.ncmeta.aos[path], self.homogeneous_time) + shapes_shape = [self.dimension_size[dim] for dim in aos_dims] + if ndim: + shapes_shape.append(ndim) + shapes = numpy.zeros(shapes_shape, dtype=SHAPE_DTYPE) + + if ndim: # ND types have a shape + for aos_coords, node in nodes_dict.items(): + shapes[aos_coords] = node.shape + sparse = not numpy.array_equiv(shapes, full_shape) + + else: # 0D types don't have a shape + for aos_coords in nodes_dict.keys(): + shapes[aos_coords] = 1 + sparse = not shapes.all() + shapes = None + + if sparse: + self.shapes[path] = shapes + if ndim: + # Ensure there is a pseudo-dimension f"{ndim}D" for shapes variable + self.dimension_size[f"{ndim}D"] = ndim + + def filter_coordinates(self, path: str) -> str: + """Filter the coordinates list from NCMetadata to filled variables only.""" + return " ".join( + coordinate + for coordinate in self.ncmeta.get_coordinates(path, self.homogeneous_time) + if coordinate in self.filled_variables + ) + + def tensorize(self, path, fillvalue): + dimensions = self.ncmeta.get_dimensions(path, self.homogeneous_time) + shape = tuple(self.dimension_size[dim] for dim in dimensions) + + # TODO: depending on the data, tmp_var may be HUGE, we may need a more + # efficient assignment algorithm for large and/or irregular data + tmp_var = numpy.full(shape, fillvalue) + if isinstance(fillvalue, str): + tmp_var = numpy.asarray(tmp_var, dtype=object) + + shapes = self.shapes.get(path) + nodes_dict = self.filled_data[path] + + # Fill tmp_var + if shapes is None: + # Data is not sparse, so we can assign to the aos_coords + for aos_coords, node in nodes_dict.items(): + tmp_var[aos_coords] = node.value + else: + # Data is sparse, so we must select a slice + for aos_coords, node in nodes_dict.items(): + tmp_var[aos_coords + tuple(map(slice, node.shape))] = node.value + + return tmp_var From 76c265a817f07456501980c49548530b3c3ec61b Mon Sep 17 00:00:00 2001 From: Maarten Sebregts Date: Fri, 7 Mar 2025 13:42:34 +0100 Subject: [PATCH 3/7] Implement `imas.util.to_xarray` Reuses most of the tensorization and metadata logic from the netCDF export. --- imas/_to_xarray.py | 73 ++++++++++++++++++++ imas/backends/netcdf/ids_tensorizer.py | 21 ++++++ imas/test/test_to_xarray.py | 94 ++++++++++++++++++++++++++ imas/util.py | 56 ++++++++++++++- 4 files changed, 241 insertions(+), 3 deletions(-) create mode 100644 imas/_to_xarray.py create mode 100644 imas/test/test_to_xarray.py diff --git a/imas/_to_xarray.py b/imas/_to_xarray.py new file mode 100644 index 00000000..6caec501 --- /dev/null +++ b/imas/_to_xarray.py @@ -0,0 +1,73 @@ +# xarray is an optional dependency, but this module won't be imported when xarray is not +# available +import numpy +import xarray + +from imas.ids_toplevel import IDSToplevel +from imas.backends.netcdf.ids_tensorizer import IDSTensorizer +from imas.ids_data_type import IDSDataType + +fillvals = { + IDSDataType.INT: -(2**31) + 1, + IDSDataType.STR: "", + IDSDataType.FLT: numpy.nan, + IDSDataType.CPX: numpy.nan * (1 + 1j), +} + + +def to_xarray(ids: IDSToplevel, *paths: str) -> xarray.Dataset: + """See :func:`imas.util.to_xarray`""" + # We really need an IDS toplevel element + if not isinstance(ids, IDSToplevel): + raise TypeError( + f"to_xarray needs a toplevel IDS element as first argument, but got {ids!r}" + ) + + # Valid path can use / or . as separator, but IDSTensorizer expects /. The following + # block checks if the paths are valid, and by using "metadata.path_string" we ensure + # that / are used as separator. + try: + paths = [ids.metadata[path].path_string for path in paths] + except KeyError as exc: + raise ValueError(str(exc)) from None + + # Converting lazy-loaded IDSs requires users to specify at least one path + if ids._lazy and not paths: + raise RuntimeError( + "This IDS is lazy loaded. Please provide at least one path to convert to" + " xarray." + ) + + # Use netcdf IDS Tensorizer to tensorize the data and determine metadata + tensorizer = IDSTensorizer(ids, paths) + tensorizer.include_coordinate_paths() + tensorizer.collect_filled_data() + tensorizer.determine_data_shapes() + + data_vars = {} + coordinate_names = set() + for path in tensorizer.filled_data: + var_name = path.replace("/", ".") + metadata = ids.metadata[path] + if metadata.data_type in (IDSDataType.STRUCTURE, IDSDataType.STRUCT_ARRAY): + continue # We don't store these in xarray + + dimensions = tensorizer.ncmeta.get_dimensions(path, tensorizer.homogeneous_time) + data = tensorizer.tensorize(path, fillvals[metadata.data_type]) + + attrs = dict(documentation=metadata.documentation) + if metadata.units: + attrs["units"] = metadata.units + coordinates = tensorizer.filter_coordinates(path) + if coordinates: + coordinate_names.update(coordinates.split(" ")) + attrs["coordinates"] = coordinates + + data_vars[var_name] = (dimensions, data, attrs) + + # Remove coordinates from data_vars and put in coordinates mapping: + coordinates = {} + for coordinate_name in coordinate_names: + coordinates[coordinate_name] = data_vars.pop(coordinate_name) + + return xarray.Dataset(data_vars, coordinates) diff --git a/imas/backends/netcdf/ids_tensorizer.py b/imas/backends/netcdf/ids_tensorizer.py index 4619919b..95bfba47 100644 --- a/imas/backends/netcdf/ids_tensorizer.py +++ b/imas/backends/netcdf/ids_tensorizer.py @@ -2,6 +2,7 @@ # You should have received the IMAS-Python LICENSE file with this project. """Tensorization logic to convert IDSs to netCDF files and/or xarray Datasets.""" +from collections import deque from typing import List import numpy @@ -53,6 +54,26 @@ def __init__(self, ids: IDSToplevel, paths_to_tensorize: List[str]) -> None: self.shapes = {} """Map of IDS paths to data shape arrays.""" + def include_coordinate_paths(self) -> None: + """Append all paths that are coordinates of self.paths_to_tensorize""" + # Use a queue so we can also take coordinates of coordinates into account + queue = deque(self.paths_to_tensorize) + # Include all parent AoS as well: + for path in self.paths_to_tensorize: + while path: + path, _, _ = path.rpartition("/") + if self.ncmeta.get_dimensions(path, self.homogeneous_time): + queue.append(path) + + self.paths_to_tensorize = [] + while queue: + path = queue.popleft() + if path in self.paths_to_tensorize: + continue # already processed + self.paths_to_tensorize.append(path) + for coordinate in self.ncmeta.get_coordinates(path, self.homogeneous_time): + queue.append(coordinate.replace(".", "/")) + def collect_filled_data(self) -> None: """Collect all filled data in the IDS and determine dimension sizes. diff --git a/imas/test/test_to_xarray.py b/imas/test/test_to_xarray.py new file mode 100644 index 00000000..1767a6d9 --- /dev/null +++ b/imas/test/test_to_xarray.py @@ -0,0 +1,94 @@ +import numpy as np +import pytest + +import imas +import imas.training +from imas.util import to_xarray + +pytest.importorskip("xarray") + + +@pytest.fixture +def entry(requires_imas, monkeypatch): + monkeypatch.setenv("IMAS_VERSION", "3.39.0") # Use fixed DD version + return imas.training.get_training_db_entry() + + +def test_to_xarray_invalid_argtype(): + ids = imas.IDSFactory("3.39.0").core_profiles() + + with pytest.raises(TypeError): + to_xarray("test") + with pytest.raises(TypeError): + to_xarray(ids.time) + with pytest.raises(TypeError): + to_xarray(ids.ids_properties) + + +def test_to_xarray_invalid_paths(): + ids = imas.IDSFactory("3.39.0").core_profiles() + + with pytest.raises(ValueError, match="xyz"): + to_xarray(ids, "xyz") + with pytest.raises(ValueError, match="ids_properties/xyz"): + to_xarray(ids, "ids_properties/xyz") + with pytest.raises(ValueError, match="Xtime"): + to_xarray(ids, "time", "Xtime") + + +def validate_trainingdb_electron_temperature_dataset(ds): + assert ds.sizes == {"time": 3, "profiles_1d.grid.rho_tor_norm:i": 101} + assert ds.data_vars.keys() == {"profiles_1d.electrons.temperature"} + assert ds.coords.keys() == {"time", "profiles_1d.grid.rho_tor_norm"} + + # Check that values are loaded as expected + assert np.allclose(ds["time"], [3.987222, 432.937598, 792.0]) + assert np.allclose( + ds.isel(time=1)["profiles_1d.electrons.temperature"][10:13], + [17728.81703089, 17440.78020568, 17139.35431082], + ) + + +def test_to_xarray_lazy_loaded(entry): + ids = entry.get("core_profiles", lazy=True) + + with pytest.raises(RuntimeError): + to_xarray(ids) + + ds = to_xarray(ids, "profiles_1d.electrons.temperature") + validate_trainingdb_electron_temperature_dataset(ds) + + +def test_to_xarray_from_trainingdb(entry): + ids = entry.get("core_profiles") + + ds = to_xarray(ids) + validate_trainingdb_electron_temperature_dataset( + ds["profiles_1d.electrons.temperature"].to_dataset() + ) + ds = to_xarray(ids, "profiles_1d.electrons.temperature") + validate_trainingdb_electron_temperature_dataset(ds) + + ds = to_xarray( + ids, "profiles_1d.electrons.temperature", "profiles_1d/electrons/density" + ) + assert ds.data_vars.keys() == { + "profiles_1d.electrons.temperature", + "profiles_1d.electrons.density", + } + + +def test_to_xarray(): + ids = imas.IDSFactory("3.39.0").core_profiles() + + ids.profiles_1d.resize(2) + ids.profiles_1d[0].electrons.temperature = [1.0, 2.0] + ids.profiles_1d[0].grid.rho_tor_norm = [0.0, 1.0] + ids.profiles_1d[0].time = 0.0 + + # These should all be identical: + ds1 = to_xarray(ids) + ds2 = to_xarray(ids, "profiles_1d.electrons.temperature") + ds3 = to_xarray(ids, "profiles_1d/electrons/temperature") + assert ds1.equals(ds2) + assert ds2.equals(ds3) diff --git a/imas/util.py b/imas/util.py index aafad2c7..64e2b228 100644 --- a/imas/util.py +++ b/imas/util.py @@ -1,8 +1,6 @@ # This file is part of IMAS-Python. # You should have received the IMAS-Python LICENSE file with this project. -"""Collection of useful helper methods when working with IMAS-Python. -""" - +"""Collection of useful helper methods when working with IMAS-Python.""" import logging import re @@ -524,3 +522,55 @@ def get_data_dictionary_version(obj: Union[IDSBase, DBEntry, IDSFactory]) -> str if isinstance(obj, IDSBase): return obj._version raise TypeError(f"Cannot get data dictionary version of '{type(obj)}'") + + +def to_xarray(ids: IDSToplevel, *paths: str) -> Any: + """Convert an IDS to an xarray Dataset. + + Args: + ids: An IDS toplevel element + paths: Optional list of element paths to convert to xarray. The full IDS will be + converted to an xarray Dataset if no paths are provided. + + Paths must not contain indices, and may use a ``/`` or a ``.`` as separator. + For example, ``"profiles_1d(itime)/electrons/density"`` is not allowed as + path, use ``"profiles_1d/electrons/density"`` or + ``profiles_1d.electrons.density"`` instead. + + Coordinates to the quantities in the requested paths will also be included + in the xarray Dataset. + + Returns: + An ``xarray.Dataset`` object. + + Examples: + .. code-block:: python + + # Convert the whole IDS to an xarray Dataset + ds = imas.util.to_xarray(ids) + + # Convert only some elements in the IDS (including their coordinates) + ds = imas.util.to_xarray( + ids, + "profiles_1d/electrons/density", + "profiles_1d/electrons/temperature", + ) + + # Paths can be provided with "/" or "." as separator + ds = imas.util.to_xarray( + ids, + "profiles_1d.electrons.density", + "profiles_1d.electrons.temperature", + ) + + See Also: + https://docs.xarray.dev/en/stable/generated/xarray.Dataset.html + """ + try: + import xarray # noqa: F401 + except ImportError: + raise RuntimeError("xarray is not available, cannot convert the IDS to xarray.") + + from imas._to_xarray import to_xarray + + return to_xarray(ids, *paths) From bd153735f0adaea0fddf4327281fa4ae4d6f1608 Mon Sep 17 00:00:00 2001 From: Maarten Sebregts Date: Fri, 7 Mar 2025 13:43:01 +0100 Subject: [PATCH 4/7] Add `xarray` as optional imas-python dependency --- .github/workflows/test_with_pytest.yml | 2 +- pyproject.toml | 10 +++++++--- 2 files changed, 8 insertions(+), 4 deletions(-) diff --git a/.github/workflows/test_with_pytest.yml b/.github/workflows/test_with_pytest.yml index 4febc7a3..7e56ac38 100644 --- a/.github/workflows/test_with_pytest.yml +++ b/.github/workflows/test_with_pytest.yml @@ -28,7 +28,7 @@ jobs: python -m venv venv source venv/bin/activate pip install --upgrade pip setuptools wheel - pip install .[h5py,netcdf,test] + pip install .[test] - name: Run tests run: | diff --git a/pyproject.toml b/pyproject.toml index 1b1b86c3..56e6dc1b 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -71,8 +71,7 @@ dependencies = [ [project.optional-dependencies] # these self-dependencies are available since pip 21.2 all = [ - "imas-python[test,docs,netcdf,h5py]" - # "imas-python[test,docs,imas-core,netcdf,h5py]" TODO enable when imas-core is available on pypi + "imas-python[test,docs]" ] docs = [ "sphinx>=6.0.0,<7.0.0", @@ -90,6 +89,9 @@ netcdf = [ h5py = [ "h5py", ] +xarray = [ + "xarray", +] test = [ "pytest>=5.4.1", "pytest-cov>=0.6", @@ -101,7 +103,9 @@ test = [ "virtualenv", # Pint and xarray are used in training snippets "pint", - "xarray", + # Optional dependencies + # TODO add imas-core when it is available on pypi + "imas-python[netcdf,h5py,xarray]", ] [project.scripts] From 334268ffa40f7602a18856bd6b1666c868d1a599 Mon Sep 17 00:00:00 2001 From: Maarten Sebregts Date: Mon, 10 Mar 2025 13:58:57 +0100 Subject: [PATCH 5/7] Update xarray advanced course to mention imas.util.to_xarray --- .../imas_snippets/tensorized_ids_to_xarray.py | 34 +++++++++++++++++++ docs/source/courses/advanced/xarray.rst | 16 +++++++-- 2 files changed, 47 insertions(+), 3 deletions(-) diff --git a/docs/source/courses/advanced/imas_snippets/tensorized_ids_to_xarray.py b/docs/source/courses/advanced/imas_snippets/tensorized_ids_to_xarray.py index ff4f4e28..559b27a7 100644 --- a/docs/source/courses/advanced/imas_snippets/tensorized_ids_to_xarray.py +++ b/docs/source/courses/advanced/imas_snippets/tensorized_ids_to_xarray.py @@ -1,6 +1,7 @@ import os import matplotlib + # To avoid possible display issues when Matplotlib uses a non-GUI backend if "DISPLAY" not in os.environ: matplotlib.use("agg") @@ -17,6 +18,39 @@ entry = imas.training.get_training_db_entry() cp = entry.get("core_profiles") +####################################################################################### +# Steps 2, 3 and 4, using imas.util.to_xarray +# Create an xarray Dataset containing t_i_average and its coordinates +xrds = imas.util.to_xarray(cp, "profiles_1d/t_i_average") +# Note that profiles_1d.grid.rho_tor_norm is a 2D coordinate: its values may be +# different at different times. +# +# Since the values at different time slices differ only minutely in this example, we'll +# rename the `profiles_1d.grid.rho_tor_norm:i` dimension to `rho_tor_norm` and set the +# values to the values of rho_tor_norm of the first time slice: +xrds = xrds.rename({"profiles_1d.grid.rho_tor_norm:i": "rho_tor_norm"}).assign_coords( + {"rho_tor_norm": xrds["profiles_1d.grid.rho_tor_norm"].isel(time=0).data} +) + +# Extract temperatures as an xarray DataArray +temperature = xrds["profiles_1d.t_i_average"] + +# 5a. Select subset of temperature where 0.4 <= rho_tor_norm < 0.6: +print(temperature.sel(rho_tor_norm=slice(0.4, 0.6))) + +# 5b. Interpolate temperature on a new grid: [0, 0.1, 0.2, ..., 0.9, 1.0] +print(temperature.interp(rho_tor_norm=numpy.linspace(0, 1, 11))) + +# 5c. Interpolate temperature on a new time base: [10, 20] +print(temperature.interp(time=[10, 20])) + +# 5d. Plot +temperature.plot(x="time", norm=matplotlib.colors.LogNorm()) +plt.show() + +####################################################################################### +# We can also manually build an xarray DataArray, this is shown below: + # 2. Store the temperature of the first time slice temperature = cp.profiles_1d[0].t_i_average diff --git a/docs/source/courses/advanced/xarray.rst b/docs/source/courses/advanced/xarray.rst index f28b452b..d1375a45 100644 --- a/docs/source/courses/advanced/xarray.rst +++ b/docs/source/courses/advanced/xarray.rst @@ -3,9 +3,10 @@ Create ``xarray.DataArray`` from an IDS .. info:: - In this lesson you will create a ``DataArray`` manually. In a future version of - IMAS-Python we plan to include functionality that will automatically do this for you. - That should further simplify working with data inside IDSs. + This lesson was written before :py:func:`imas.util.to_xarray` was + implemented. This lesson is retained for educational purposes, however we + recommend to use :py:func:`imas.util.to_xarray` instead of manually creating + xarray ``DataArray``\ s. Let's start with an introduction of Xarray. According to `their website `_ (where you @@ -61,6 +62,10 @@ Exercise 1: create a ``DataArray`` for ``profiles_1d/temperature`` .. md-tab-item:: Solution + This exercise was created before the implementation of + :py:func:`imas.util.to_xarray`. The original approach is available below + for educational purposes. + .. literalinclude:: imas_snippets/ids_to_xarray.py @@ -96,4 +101,9 @@ the ``profiles_1d`` array of structures. When the grid is not changing in the ID .. md-tab-item:: Solution + This exercise was created before the implementation of + :py:func:`imas.util.to_xarray`. Below code sample is updated to provide + two alternatives: the first is based on :py:func:`imas.util.to_xarray`, + the second is the original, manual approach. + .. literalinclude:: imas_snippets/tensorized_ids_to_xarray.py From 10d30ec6f327cdcb5677c6ba4280ad87fb1e96b6 Mon Sep 17 00:00:00 2001 From: Maarten Sebregts Date: Mon, 10 Mar 2025 15:00:37 +0100 Subject: [PATCH 6/7] Additional documentation for `imas.util.to_xarray` --- docs/source/courses/advanced/xarray.rst | 2 + docs/source/netcdf.rst | 49 ++++++++++++++++++++++++- 2 files changed, 49 insertions(+), 2 deletions(-) diff --git a/docs/source/courses/advanced/xarray.rst b/docs/source/courses/advanced/xarray.rst index d1375a45..249520d0 100644 --- a/docs/source/courses/advanced/xarray.rst +++ b/docs/source/courses/advanced/xarray.rst @@ -8,6 +8,8 @@ Create ``xarray.DataArray`` from an IDS recommend to use :py:func:`imas.util.to_xarray` instead of manually creating xarray ``DataArray``\ s. + See also: :ref:`Convert IMAS-Python IDSs directly to Xarray Datasets`. + Let's start with an introduction of Xarray. According to `their website `_ (where you can also find an excellent summary of why that is useful): diff --git a/docs/source/netcdf.rst b/docs/source/netcdf.rst index fb85ea23..868ae429 100644 --- a/docs/source/netcdf.rst +++ b/docs/source/netcdf.rst @@ -1,7 +1,7 @@ .. _`IMAS netCDF files`: -IMAS netCDF files -================= +IMAS netCDF files \& Xarray +=========================== .. toctree:: :hidden: @@ -69,6 +69,7 @@ features that are supported by DBEntries using ``imas_core`` respectively - Yes (requires ``imas_core >= 5.4.0``) - Not implemented +.. _`Using IMAS netCDF files with 3rd-party tools`: Using IMAS netCDF files with 3rd-party tools -------------------------------------------- @@ -138,3 +139,47 @@ Validating an IMAS netCDF file IMAS netCDF files can be validated with IMAS-Python through the command line ``imas validate_nc ``. See also :ref:`IMAS-Python Command Line tool` or type ``imas validate_nc --help`` in a command line. + + +.. _`Convert IMAS-Python IDSs directly to Xarray Datasets`: + +Convert IMAS-Python IDSs directly to Xarray Datasets +---------------------------------------------------- + +In the section :ref:`Using IMAS netCDF files with 3rd-party tools`, we showed +how to open an IMAS netCDF file with Xarray. However, IMAS-Python IDSs can also +be converted directly to Xarray ``Dataset``\ s with +:py:func:`imas.util.to_xarray`. + +This method can be used to convert a full IDS to an Xarray ``Dataset``, or only +specific paths inside the IDS. The latter variant can also be combined with +:ref:`lazy loading`. We'll show a small example below: + +.. code-block:: python + :caption: Converting a lazy loaded IDS to Xarray + + import imas.training + + # Open the training entry + with imas.training.get_training_db_entry() as training_entry: + # Lazy load the core_profiles IDS + core_profiles = training_entry.get("core_profiles", lazy=True) + # Load the average ion temperature and all coordinate data + xrds = imas.util.to_xarray(core_profiles, "profiles_1d.t_i_average") + # All relevant data is now loaded from the data entry into the xarray + # Dataset. We close the data entry by exiting the with-statement. + + # Inspect what's inside the dataset + print(xrds.data_vars) + # Data variables: + # profiles_1d.t_i_average + + # Included coordinates depends on the used Data Dictionary version + print(xrds.coords) + # Coordinates: (with DD 4.0.0) + # * time + # profiles_1d.grid.area + # profiles_1d.grid.volume + # profiles_1d.grid.rho_tor + # profiles_1d.grid.rho_tor_norm + # profiles_1d.grid.psi From f44bcbbeff763724c7923fc5a4415c6b125a456e Mon Sep 17 00:00:00 2001 From: Maarten Sebregts Date: Thu, 13 Mar 2025 11:33:06 +0100 Subject: [PATCH 7/7] Additional documentation and example for to_xarray --- imas/util.py | 22 ++++++++++++++++++++++ 1 file changed, 22 insertions(+) diff --git a/imas/util.py b/imas/util.py index 64e2b228..e41d28a2 100644 --- a/imas/util.py +++ b/imas/util.py @@ -543,6 +543,22 @@ def to_xarray(ids: IDSToplevel, *paths: str) -> Any: Returns: An ``xarray.Dataset`` object. + Notes: + - Lazy loaded IDSs are not supported for full IDS conversion + (``imas.util.to_xarray(ids)`` will raise an exception for lazy loaded IDSs). + This function can work with lazy loaded IDSs when paths are explicitly + provided: this might take a while because it will load all data for the + provided paths and their coordinates. + - This function does not accept wildcards for the paths. However, it is possible + to combine this method with :py:func:`imas.util.find_paths`, see the Examples + below. + - This function may return an empty dataset in the following cases: + + - The provided IDS does not contain any data. + - The IDS does not contain any data for the provided paths. + - The provided paths do not point to data nodes, but to (arrays of) + structures. + Examples: .. code-block:: python @@ -563,6 +579,12 @@ def to_xarray(ids: IDSToplevel, *paths: str) -> Any: "profiles_1d.electrons.temperature", ) + # Combine with imas.util.find_paths to include all paths containing + # "profiles_1d" in the xarray conversion: + profiles_1d_paths = imas.util.find_paths(ids, "profiles_1d") + assert len(profiles_1d_paths) > 0 + ds = imas.util.to_xarray(ids, *profiles_1d_paths) + See Also: https://docs.xarray.dev/en/stable/generated/xarray.Dataset.html """