diff --git a/CHANGELOG.rst b/CHANGELOG.rst index 69bab7ad..698c0cee 100644 --- a/CHANGELOG.rst +++ b/CHANGELOG.rst @@ -10,7 +10,7 @@ Contributor to this version: Ludwig Lierhammer (:user:`ludwiglierhammer`) Announcements ^^^^^^^^^^^^^ * `cdm_reader_mapper` now drops support for Python 3.10 (:pull:`419`) -* `cdm_reader_mapper` now uses `cruft ` and the `Ouranosinc cookiecutter template ` (:issue:`369`, :pull:`419`) +* `cdm_reader_mapper` now uses `cruft `_ and the `Ouranosinc cookiecutter template `_ (:issue:`369`, :pull:`419`) New features and enhancements ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ @@ -20,7 +20,7 @@ Breaking changes ^^^^^^^^^^^^^^^^ * Development dependencies ("dev", "docs") are now installed via the new `dependency-groups` conventions (`PEP 735 `_) (:pull:`419`) * `prek` is now the suggested pre-commit runner (installed by default via `pip install --group dev`) (:pull:`419`) -* delete submodule ``src.cdm_reader_mapper.duplicates`` (:issue:`152`, :issue:`283`, :pull:`434`) +* submodule ``src.cdm_reader_mapper.duplicates`` has been deleted and moved to `marine_qc `_ (:issue:`283`, :pull:`434`) * ``cdm_reader_mapper.DupDetect`` is not importable anymore * ``cdm_reader_mapper.duplicate_check`` is not importable anymore @@ -30,7 +30,8 @@ Breaking changes * ``cdm_reader_mapper.DataBundle.remove_duplicates`` is not callable anymore * ``cdm_reader_mapper.DataBundle`` does not have attribute ``DupDetect`` anymore -* submodule ``src.cdm_reader_mapper.duplicates`` has been moved to `marine_qc `_ (:issue:`283`, :pull:`434`) +* the code has been updated to support pandas >= 3.0.0 (:pull:`435`) +* reader functions now return object types instead of string types (:pull:`435`) Internal changes @@ -53,6 +54,8 @@ Internal changes * remove helper class `core._utilities._DataBundle` and integrate it in `core.databundle.DataBundle` (:pull:`419`) * make use of `pathlib.Path` instead of `os.path` (:pull:`419`) * use consistently parameter "imodel" instead of "data_model" and 
"correction_method" instead of "fix_method" in `metmetpy` modules (:pull:`419`) +* make some JSON and iterator utility functions directly importable in submodule ``cdm_reader_mapper.common`` (:pull:`435`) +* replace relative imports with absolute imports for any submodules located one or more directories above the current submodule (:pull:`435`) 2.4.1 (2016-04-16) diff --git a/environment-docs.yml b/environment-docs.yml index e67b4c5b..b9676e4f 100755 --- a/environment-docs.yml +++ b/environment-docs.yml @@ -12,7 +12,7 @@ dependencies: - h5netcdf >=1.6.1 - h5py >=3.13.0 - numpy >=2.1.3 - - pandas >=2.2.0 + - pandas >=3.0.0 - pyarrow >=15.0.0 # Strongly encouraged for Pandas v2.2.0+ - timezonefinder >6.5.0 - xarray >=2023.11.0,!=2024.10.0 diff --git a/pyproject.toml b/pyproject.toml index 22dab2e4..f4e4acfc 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -93,7 +93,7 @@ dependencies = [ "h5netcdf >=1.6.1", "h5py >= 3.13.0", "numpy>=2.1.3", - "pandas>=2.2.0", + "pandas>=3.0.0", "platformdirs >4.0.0", "pyarrow >=15.0.0", "requests", diff --git a/src/cdm_reader_mapper/cdm_mapper/codes/codes.py b/src/cdm_reader_mapper/cdm_mapper/codes/codes.py index e3388a67..a9a0c058 100755 --- a/src/cdm_reader_mapper/cdm_mapper/codes/codes.py +++ b/src/cdm_reader_mapper/cdm_mapper/codes/codes.py @@ -13,14 +13,13 @@ from pathlib import Path from typing import Any -from cdm_reader_mapper.common.json_dict import ( +from cdm_reader_mapper.cdm_mapper import properties +from cdm_reader_mapper.common import ( collect_json_files, combine_dicts, open_json_file, ) -from .. 
import properties - def _eval(s: str) -> Any: """ diff --git a/src/cdm_reader_mapper/cdm_mapper/mapper.py b/src/cdm_reader_mapper/cdm_mapper/mapper.py index 712823db..3a8f28ab 100755 --- a/src/cdm_reader_mapper/cdm_mapper/mapper.py +++ b/src/cdm_reader_mapper/cdm_mapper/mapper.py @@ -18,11 +18,12 @@ import pandas as pd -from cdm_reader_mapper.common import logging_hdlr -from cdm_reader_mapper.common.iterators import ( +from cdm_reader_mapper.common import ( ParquetStreamReader, ProcessFunction, + logging_hdlr, process_function, + standardize_object_columns, ) from . import properties @@ -65,14 +66,14 @@ def _drop_duplicated_rows(df: pd.DataFrame) -> pd.DataFrame: Parameters ---------- df : pd.DataFrame - Input DataFrame to drop duplictaed rows. + Input DataFrame to drop duplicated rows. Returns ------- pd.DataFrame Input DataFrame with deleted duplicated rows. """ - list_cols = [col for col in df.columns if df[col].apply(lambda x: isinstance(x, list)).any()] + list_cols = [col for col in df.columns if df[col].dtype == "object" and df[col].apply(lambda x: isinstance(x, list)).any()] for col in list_cols: df[col] = df[col].apply(lambda x: tuple(x) if isinstance(x, list) else x) @@ -481,7 +482,7 @@ def _table_mapping( if drop_duplicates: table_df = _drop_duplicated_rows(table_df) - return table_df + return standardize_object_columns(table_df) def _prepare_cdm_tables(cdm_subset: str | Sequence[str]) -> dict[str, Any]: diff --git a/src/cdm_reader_mapper/cdm_mapper/properties.py b/src/cdm_reader_mapper/cdm_mapper/properties.py index f122ca61..bcf62394 100755 --- a/src/cdm_reader_mapper/cdm_mapper/properties.py +++ b/src/cdm_reader_mapper/cdm_mapper/properties.py @@ -2,7 +2,7 @@ from __future__ import annotations -from ..properties import SupportedDataModels, SupportedFileTypes +from cdm_reader_mapper.properties import SupportedDataModels, SupportedFileTypes __all__ = [ @@ -27,7 +27,7 @@ "observations-slp", ] -# ...from CDM table definitions psuedo-sql(...) 
-------------------------------- +# ...from CDM table definitions pseudo-sql(...) -------------------------------- pandas_dtypes: dict[str, dict[str, str]] = {} pandas_dtypes["from_sql"] = {} pandas_dtypes["from_sql"]["timestamp with timezone"] = "object" diff --git a/src/cdm_reader_mapper/cdm_mapper/reader.py b/src/cdm_reader_mapper/cdm_mapper/reader.py index 8d5b0837..45282b9c 100755 --- a/src/cdm_reader_mapper/cdm_mapper/reader.py +++ b/src/cdm_reader_mapper/cdm_mapper/reader.py @@ -51,11 +51,10 @@ import pandas as pd -from cdm_reader_mapper.common import get_filename, logging_hdlr +from cdm_reader_mapper.common import get_filename, logging_hdlr, standardize_object_columns from cdm_reader_mapper.core.databundle import DataBundle -from ..properties import SupportedFileTypes -from .properties import cdm_tables +from .properties import SupportedFileTypes, cdm_tables from .utils.conversions import convert_from_str_df, convert_to_str_df from .utils.utilities import get_cdm_subset, get_usecols @@ -439,4 +438,6 @@ def read_tables( elif to_str is True: merged = convert_to_str_df(merged, imodel, cdm_subset=cdm_subset) - return DataBundle(data=merged, columns=merged.columns, mode="tables") + data = standardize_object_columns(merged) + + return DataBundle(data=data, columns=merged.columns, mode="tables") diff --git a/src/cdm_reader_mapper/cdm_mapper/tables/tables.py b/src/cdm_reader_mapper/cdm_mapper/tables/tables.py index 8c58395e..a9a44742 100755 --- a/src/cdm_reader_mapper/cdm_mapper/tables/tables.py +++ b/src/cdm_reader_mapper/cdm_mapper/tables/tables.py @@ -15,14 +15,13 @@ from pathlib import Path from typing import Any -from cdm_reader_mapper.common.json_dict import ( +from cdm_reader_mapper.cdm_mapper import properties +from cdm_reader_mapper.common import ( collect_json_files, combine_dicts, open_json_file, ) -from .. 
import properties - def get_cdm_atts( cdm_tables: str | tuple[str, str] | Sequence[str | tuple[str, str]] | None = None, diff --git a/src/cdm_reader_mapper/cdm_mapper/utils/conversions.py b/src/cdm_reader_mapper/cdm_mapper/utils/conversions.py index c919d364..32ade61b 100755 --- a/src/cdm_reader_mapper/cdm_mapper/utils/conversions.py +++ b/src/cdm_reader_mapper/cdm_mapper/utils/conversions.py @@ -8,8 +8,8 @@ import numpy as np import pandas as pd -from .. import properties -from ..tables.tables import get_cdm_atts, get_imodel_maps +from cdm_reader_mapper.cdm_mapper import properties +from cdm_reader_mapper.cdm_mapper.tables.tables import get_cdm_atts, get_imodel_maps class BaseConverter: @@ -269,7 +269,7 @@ def _convert_value(x: Any) -> str: return "{" + ",".join(str_list) + "}" - return data.apply(_convert_value).astype(object) + return data.apply(_convert_value) def _convert_str_to_str(data: pd.Series, null_label: str) -> pd.Series: @@ -311,7 +311,7 @@ def _return_str(x: Any, null_label: str) -> str: return null_label return str(x) - return data.apply(lambda x: _return_str(x, null_label)) + return data.apply(lambda x: _return_str(x, null_label)).astype(object) def _convert_str_from_str(data: pd.Series, null_label: str) -> pd.Series: @@ -372,7 +372,7 @@ def _convert_str_array_to_str(data: pd.Series, null_label: str) -> pd.Series: pd.Series Series with integer arrays in "{...}" format. """ - return _convert_array_general_to_str(data, null_label, dtype=str) + return _convert_array_general_to_str(data, null_label, dtype=object) def _convert_str_array_from_str(data: pd.Series, null_label: str) -> pd.Series: @@ -662,7 +662,10 @@ def _convert_datetime_from_str(data: pd.Series, null_label: str) -> pd.Series: Series with values converted to pandas datetime dtype (datetime64[ns]). Invalid or non-convertible values are set to NaT. 
""" - return pd.to_datetime(data, errors="coerce") + dt = pd.to_datetime(data, errors="coerce") + if dt.dt.tz is not None: + dt = dt.dt.tz_localize(None) + return dt.astype("datetime64[ns]") def _convert_column( @@ -832,6 +835,7 @@ def _convert_columns( null_label, converters, ) + return data diff --git a/src/cdm_reader_mapper/cdm_mapper/utils/mapping_functions.py b/src/cdm_reader_mapper/cdm_mapper/utils/mapping_functions.py index ef5abcd6..fe74b193 100755 --- a/src/cdm_reader_mapper/cdm_mapper/utils/mapping_functions.py +++ b/src/cdm_reader_mapper/cdm_mapper/utils/mapping_functions.py @@ -369,32 +369,31 @@ def datetime_imma1(self, df: pd.DataFrame) -> pd.DatetimeIndex: DatetimeIndex of converted timestamps. """ if df.empty: - return pd.DatetimeIndex([]) + return pd.DatetimeIndex([], dtype="datetime64[ns]") df = df.iloc[:, 0:4] date_format = "%Y-%m-%d-%H-%M" hr_ = df.columns[-1] df = df.assign(HR=df.iloc[:, -1]) df["M"] = df["HR"].copy() - df = df.drop(columns=hr_, axis=1) - + df = df.drop(columns=hr_) hr_min = df.apply(lambda x: self.datetime_decimalhour_to_hm(x), axis=1) df["HR"] = hr_min["HR"] df["M"] = hr_min["M"] df = df.apply(lambda col: col.map(to_int)) - strings = df.astype(str).apply("-".join, axis=1).values + strings = df.apply(lambda row: "-".join(map(str, row)), axis=1).values result = pd.to_datetime( strings, format=date_format, errors="coerce", ) result.index = df.index - return result + return result.astype("datetime64[ns]") def datetime_imma1_to_utc(self, df: pd.DataFrame) -> pd.DatetimeIndex: """ - Convert to pandas datetime object for IMMA1 deck 701 format. + Convert to pandas datetime object to UTC time. Set missing hour to 12 and use latitude and longitude information to convert local midday to UTC time. @@ -402,7 +401,7 @@ def datetime_imma1_to_utc(self, df: pd.DataFrame) -> pd.DatetimeIndex: Parameters ---------- df : pd.DataFrame - IMMA1 deck 701 dataset containing year, month, day, latitude, and longitude. 
+ IMMA1 dataset containing year, month, day, latitude, and longitude. Returns ------- @@ -410,7 +409,7 @@ def datetime_imma1_to_utc(self, df: pd.DataFrame) -> pd.DatetimeIndex: DatetimeIndex with timestamps converted to UTC. """ if df.empty: - return pd.DatetimeIndex([]) + return pd.DatetimeIndex([], dtype="datetime64[ns]") date_format = "%Y-%m-%d-%H-%M" @@ -437,7 +436,7 @@ def datetime_imma1_to_utc(self, df: pd.DataFrame) -> pd.DatetimeIndex: results = df_time.apply(lambda x: convert_to_utc_i(x["Dates"], x["Time_zone"]), axis=1) results.index = df.index - return pd.DatetimeIndex(results.dt.tz_convert(None)) + return pd.DatetimeIndex(results.dt.tz_convert(None), dtype="datetime64[ns]") def datetime_imma1_701(self, df: pd.DataFrame) -> pd.DatetimeIndex: """ @@ -454,7 +453,7 @@ def datetime_imma1_701(self, df: pd.DataFrame) -> pd.DatetimeIndex: DatetimeIndex with converted timestamps. """ if df.empty: - return pd.DatetimeIndex([]) + return pd.DatetimeIndex([], dtype="datetime64[ns]") hr = df.iloc[:, 3] valid_mask = hr.notna() @@ -484,7 +483,7 @@ def datetime_immt(self, df: pd.DataFrame) -> pd.DatetimeIndex: DatetimeIndex of converted timestamps. """ if df.empty: - return pd.DatetimeIndex([]) + return pd.DatetimeIndex([], dtype="datetime64[ns]") date_format = "%Y-%m-%d-%H-%M" df = df.copy() @@ -496,7 +495,7 @@ def datetime_immt(self, df: pd.DataFrame) -> pd.DatetimeIndex: format=date_format, errors="coerce", ) - return pd.DatetimeIndex(result) + return pd.DatetimeIndex(result).astype("datetime64[ns]") def datetime_utcnow(self, df: pd.DataFrame) -> datetime.datetime: """ @@ -566,7 +565,7 @@ def datetime_marob(self, series: pd.Series, format: str = "%Y-%m-%dT%H:%M:%S") - pd.Series Series of converted dates. """ - return series_strptime(series, format) + return series_strptime(series, format).astype("datetime64[ns]") def df_col_join(self, df: pd.DataFrame, sep: str) -> pd.Series: """ @@ -819,7 +818,6 @@ def string_add( Series with modified string values. 
""" result = np.vectorize(string_add_i, otypes="O")(prepend, series, append, separator) - return pd.Series(result, index=series.index, dtype="object") def string_join_add( @@ -1016,7 +1014,7 @@ def gdac_uid(self, df: pd.DataFrame, prepend: str = "", append: str = "") -> pd. for i, n in enumerate(name): uid[i] = str(prepend) + uuid.uuid5(uuid.NAMESPACE_OID, str(n)).hex + str(append) df["UUID"] = uid - return df["UUID"] + return df["UUID"].astype(object) def gdac_latitude(self, df: pd.DataFrame) -> pd.Series: """ diff --git a/src/cdm_reader_mapper/cdm_mapper/utils/utilities.py b/src/cdm_reader_mapper/cdm_mapper/utils/utilities.py index 6f405cbd..21529665 100755 --- a/src/cdm_reader_mapper/cdm_mapper/utils/utilities.py +++ b/src/cdm_reader_mapper/cdm_mapper/utils/utilities.py @@ -4,7 +4,7 @@ from collections.abc import Iterable from typing import Any -from .. import properties +from cdm_reader_mapper.cdm_mapper import properties def dict_to_tuple_list(dic: dict[Any, Any]) -> list[tuple[Any, Any]]: diff --git a/src/cdm_reader_mapper/cdm_mapper/writer.py b/src/cdm_reader_mapper/cdm_mapper/writer.py index 64288e29..1ece36fc 100755 --- a/src/cdm_reader_mapper/cdm_mapper/writer.py +++ b/src/cdm_reader_mapper/cdm_mapper/writer.py @@ -29,7 +29,7 @@ from cdm_reader_mapper.common import get_filename, logging_hdlr -from ..properties import SupportedFileTypes +from .properties import SupportedFileTypes from .tables.tables import get_cdm_atts from .utils.conversions import convert_from_str_df, convert_to_str_df from .utils.utilities import adjust_filename, dict_to_tuple_list, get_cdm_subset diff --git a/src/cdm_reader_mapper/common/__init__.py b/src/cdm_reader_mapper/common/__init__.py index 22a68549..ff175e2f 100755 --- a/src/cdm_reader_mapper/common/__init__.py +++ b/src/cdm_reader_mapper/common/__init__.py @@ -2,10 +2,12 @@ from __future__ import annotations -from . 
import json_dict from .getting_files import load_file from .inspect import count_by_cat, get_length from .io_files import get_filename +from .iterators import ParquetStreamReader, ProcessFunction, is_valid_iterator, parquet_stream_from_iterable, process_disk_backed, process_function +from .json_dict import collect_json_files, combine_dicts, open_json_file +from .object_types import standardize_object_columns from .replace import replace_columns from .select import ( split_by_boolean, @@ -17,15 +19,24 @@ __all__ = [ + "ParquetStreamReader", + "ProcessFunction", + "collect_json_files", + "combine_dicts", "count_by_cat", "get_filename", "get_length", - "json_dict", + "is_valid_iterator", "load_file", + "open_json_file", + "parquet_stream_from_iterable", + "process_disk_backed", + "process_function", "replace_columns", "split_by_boolean", "split_by_boolean_false", "split_by_boolean_true", "split_by_column_entries", "split_by_index", + "standardize_object_columns", ] diff --git a/src/cdm_reader_mapper/common/iterators.py b/src/cdm_reader_mapper/common/iterators.py index de1bb350..94e97bb2 100755 --- a/src/cdm_reader_mapper/common/iterators.py +++ b/src/cdm_reader_mapper/common/iterators.py @@ -18,6 +18,8 @@ import pyarrow.parquet as pq import xarray as xr +from .object_types import standardize_object_columns + class ProcessFunction: r""" @@ -420,6 +422,9 @@ def __exit__(self, _exc_type: type | None, _exc_val: BaseException | None, _exc_ self.close() +ParquetStreamReader.__module__ = "cdm_reader_mapper.common" + + def _sort_chunk_outputs( outputs: tuple[Any, ...], capture_meta: bool, requested_types: tuple[type, ...] 
) -> tuple[list[pd.DataFrame | pd.Series], list[Any]]: @@ -558,6 +563,8 @@ def _parquet_generator(temp_dir: TemporaryDirectory[str], data_type: type, schem for f in files: df = pd.read_parquet(f) + df = standardize_object_columns(df) + if data_type is pd.Series: s = df.iloc[:, 0].copy() s.name = schema diff --git a/src/cdm_reader_mapper/common/object_types.py b/src/cdm_reader_mapper/common/object_types.py new file mode 100755 index 00000000..afe4e18e --- /dev/null +++ b/src/cdm_reader_mapper/common/object_types.py @@ -0,0 +1,27 @@ +"""Utility function for standardizing string and object columns of a DataFrame.""" + +from __future__ import annotations + +import pandas as pd + + +def standardize_object_columns(df: pd.DataFrame) -> pd.DataFrame: + """ + Convert string columns to object dtype and replace NaNs with None. + + Parameters + ---------- + df : pd.DataFrame + The input DataFrame to be standardized. + + Returns + ------- + pd.DataFrame + A copy of the input DataFrame after the dtype conversion and NaN handling; the input is not modified. + """ + df = df.copy() + string_cols = df.select_dtypes(include="string").columns + df[string_cols] = df[string_cols].astype(object) + object_cols = df.select_dtypes(include="object").columns + df[object_cols] = df[object_cols].fillna(None) + return df diff --git a/src/cdm_reader_mapper/core/_utilities.py b/src/cdm_reader_mapper/core/_utilities.py index 7c6dff74..fdffc430 100755 --- a/src/cdm_reader_mapper/core/_utilities.py +++ b/src/cdm_reader_mapper/core/_utilities.py @@ -8,7 +8,7 @@ import numpy as np import pandas as pd -from cdm_reader_mapper.common.iterators import ( +from cdm_reader_mapper.common import ( ParquetStreamReader, is_valid_iterator, parquet_stream_from_iterable, diff --git a/src/cdm_reader_mapper/core/databundle.py b/src/cdm_reader_mapper/core/databundle.py index 4b3f20a8..985724c7 100755 --- a/src/cdm_reader_mapper/core/databundle.py +++ b/src/cdm_reader_mapper/core/databundle.py @@ -8,15 +8,16 @@ from cdm_reader_mapper.cdm_mapper.mapper import map_model from 
cdm_reader_mapper.common import ( + ParquetStreamReader, count_by_cat, get_length, + is_valid_iterator, replace_columns, split_by_boolean_false, split_by_boolean_true, split_by_column_entries, split_by_index, ) -from cdm_reader_mapper.common.iterators import ParquetStreamReader, is_valid_iterator from cdm_reader_mapper.metmetpy import ( correct_datetime, correct_pt, diff --git a/src/cdm_reader_mapper/core/reader.py b/src/cdm_reader_mapper/core/reader.py index a7d0efa9..d2ca72c6 100755 --- a/src/cdm_reader_mapper/core/reader.py +++ b/src/cdm_reader_mapper/core/reader.py @@ -6,8 +6,8 @@ from cdm_reader_mapper.cdm_mapper.reader import read_tables from cdm_reader_mapper.mdf_reader.reader import read_data, read_mdf +from cdm_reader_mapper.properties import SupportedReadModes -from ..properties import SupportedReadModes from .databundle import DataBundle diff --git a/src/cdm_reader_mapper/core/writer.py b/src/cdm_reader_mapper/core/writer.py index 49dba472..097217e2 100755 --- a/src/cdm_reader_mapper/core/writer.py +++ b/src/cdm_reader_mapper/core/writer.py @@ -8,8 +8,7 @@ from cdm_reader_mapper.cdm_mapper.writer import write_tables from cdm_reader_mapper.mdf_reader.writer import write_data - -from ..properties import SupportedWriteModes +from cdm_reader_mapper.properties import SupportedWriteModes supported_write_modes = get_args(SupportedWriteModes) diff --git a/src/cdm_reader_mapper/mdf_reader/codes/codes.py b/src/cdm_reader_mapper/mdf_reader/codes/codes.py index a8786913..0501f166 100755 --- a/src/cdm_reader_mapper/mdf_reader/codes/codes.py +++ b/src/cdm_reader_mapper/mdf_reader/codes/codes.py @@ -8,13 +8,12 @@ from pathlib import Path from typing import Any -from cdm_reader_mapper.common.json_dict import ( +from cdm_reader_mapper.common import ( collect_json_files, combine_dicts, open_json_file, ) - -from .. 
import properties +from cdm_reader_mapper.mdf_reader import properties def read_table( diff --git a/src/cdm_reader_mapper/mdf_reader/properties.py b/src/cdm_reader_mapper/mdf_reader/properties.py index 1cb9ade1..a934a368 100755 --- a/src/cdm_reader_mapper/mdf_reader/properties.py +++ b/src/cdm_reader_mapper/mdf_reader/properties.py @@ -3,7 +3,7 @@ from __future__ import annotations from typing import get_args -from ..properties import NumericTypes, ObjectTypes, SupportedDataModels +from cdm_reader_mapper.properties import NumericTypes, ObjectTypes, SupportedDataModels __all__ = [ diff --git a/src/cdm_reader_mapper/mdf_reader/reader.py b/src/cdm_reader_mapper/mdf_reader/reader.py index 0c85d4c3..ec494e0f 100755 --- a/src/cdm_reader_mapper/mdf_reader/reader.py +++ b/src/cdm_reader_mapper/mdf_reader/reader.py @@ -8,9 +8,9 @@ import pandas as pd from cdm_reader_mapper import DataBundle +from cdm_reader_mapper.common import open_json_file, standardize_object_columns +from cdm_reader_mapper.properties import SupportedFileTypes -from ..common.json_dict import open_json_file -from ..properties import SupportedFileTypes from .utils.filereader import FileReader from .utils.utilities import as_list, as_path, read_csv, read_feather, read_parquet, validate_arg @@ -302,6 +302,11 @@ def _read_data( **mask_kwargs, ) + data = standardize_object_columns(data) + + if "dtypes" in info: + info["dtypes"] = info["dtypes"].replace("str", "object") + return data, mask, info diff --git a/src/cdm_reader_mapper/mdf_reader/schemas/schemas.py b/src/cdm_reader_mapper/mdf_reader/schemas/schemas.py index 6ae0e845..7cf314e8 100755 --- a/src/cdm_reader_mapper/mdf_reader/schemas/schemas.py +++ b/src/cdm_reader_mapper/mdf_reader/schemas/schemas.py @@ -8,9 +8,8 @@ from pathlib import Path from typing import Any, TypedDict, get_args -from cdm_reader_mapper.common.json_dict import collect_json_files, combine_dicts - -from .. 
import properties +from cdm_reader_mapper.common import collect_json_files, combine_dicts +from cdm_reader_mapper.mdf_reader import properties class SectionDict(TypedDict, total=False): @@ -123,8 +122,6 @@ def _resolve_schema_files( if ext_schema_path: schema_path = Path(ext_schema_path).resolve() path = schema_path / f"{schema_path.name}.json" - # print(path) - # exit() if not path.is_file(): raise FileNotFoundError(f"Can't find input schema path {ext_schema_path}") return [path] diff --git a/src/cdm_reader_mapper/mdf_reader/utils/convert_and_decode.py b/src/cdm_reader_mapper/mdf_reader/utils/convert_and_decode.py index 318770d2..3f857653 100755 --- a/src/cdm_reader_mapper/mdf_reader/utils/convert_and_decode.py +++ b/src/cdm_reader_mapper/mdf_reader/utils/convert_and_decode.py @@ -7,7 +7,8 @@ import pandas as pd -from .. import properties +from cdm_reader_mapper.mdf_reader import properties + from .utilities import convert_str_boolean @@ -263,6 +264,8 @@ def object_to_numeric( pd.Series Converted Series. """ + if data.dtype == "str": + data = data.astype(object) if data.dtype != "object": return data @@ -298,6 +301,8 @@ def object_to_object( pd.Series Cleaned Series. """ + if data.dtype == "str": + data = data.astype(object) if data.dtype != "object": return data @@ -308,7 +313,7 @@ def object_to_object( elif disable_white_strip == "r": data = data.str.lstrip() - return data.apply(lambda x: None if isinstance(x, str) and (x.isspace() or not x) else x) + return data.replace({"": pd.NA}) def object_to_datetime( self, @@ -332,6 +337,8 @@ def object_to_datetime( pd.Series Datetime Series. 
""" + if data.dtype == "str": + data = data.astype(object) if data.dtype != "object": return data diff --git a/src/cdm_reader_mapper/mdf_reader/utils/filereader.py b/src/cdm_reader_mapper/mdf_reader/utils/filereader.py index 6c2985ea..68f5ffc5 100755 --- a/src/cdm_reader_mapper/mdf_reader/utils/filereader.py +++ b/src/cdm_reader_mapper/mdf_reader/utils/filereader.py @@ -13,10 +13,10 @@ import pandas as pd import xarray as xr -from cdm_reader_mapper.common.iterators import ProcessFunction, process_function +from cdm_reader_mapper.common import ProcessFunction, process_function, standardize_object_columns from cdm_reader_mapper.core.databundle import DataBundle +from cdm_reader_mapper.mdf_reader import properties -from .. import properties from .convert_and_decode import convert_and_decode from .parser import ( ParserConfig, @@ -279,6 +279,8 @@ def _process_data( for object_column in object_columns: data[object_column] = data[object_column].str.encode(config.encoding).str.decode("utf-8") + data = standardize_object_columns(data) + return data, mask, config @process_function() diff --git a/src/cdm_reader_mapper/mdf_reader/utils/parser.py b/src/cdm_reader_mapper/mdf_reader/utils/parser.py index 82d5d384..218b6eb8 100755 --- a/src/cdm_reader_mapper/mdf_reader/utils/parser.py +++ b/src/cdm_reader_mapper/mdf_reader/utils/parser.py @@ -14,8 +14,9 @@ import pandas as pd import xarray as xr -from .. 
import properties -from ..schemas.schemas import SchemaDict, read_schema +from cdm_reader_mapper.mdf_reader import properties +from cdm_reader_mapper.mdf_reader.schemas.schemas import SchemaDict, read_schema + from .convert_and_decode import Converters, Decoders from .utilities import convert_dtypes diff --git a/src/cdm_reader_mapper/mdf_reader/utils/utilities.py b/src/cdm_reader_mapper/mdf_reader/utils/utilities.py index f34decac..c8f1c8b3 100755 --- a/src/cdm_reader_mapper/mdf_reader/utils/utilities.py +++ b/src/cdm_reader_mapper/mdf_reader/utils/utilities.py @@ -9,7 +9,7 @@ import pandas as pd -from cdm_reader_mapper.common.iterators import ProcessFunction, process_function +from cdm_reader_mapper.common import ProcessFunction, process_function def as_list(x: str | Iterable[Any] | None) -> list[Any] | None: @@ -451,10 +451,12 @@ def convert_str_boolean(x: Any) -> Any: bool or original value True if 'True', False if 'False', else original value. """ + if pd.isna(x): + return x if x == "True": - x = True + return True if x == "False": - x = False + return False return x @@ -498,4 +500,5 @@ def remove_boolean_values(data: pd.DataFrame, dtypes: dict[str, str]) -> pd.Data """ data = data.map(_remove_boolean_values) dtype = _adjust_dtype(dtypes, data) + data = data.apply(lambda series: series.str.strip() if series.dtype == "str" else series) return data.astype(dtype) diff --git a/src/cdm_reader_mapper/mdf_reader/utils/validators.py b/src/cdm_reader_mapper/mdf_reader/utils/validators.py index a8f0ffdd..4c1888cc 100755 --- a/src/cdm_reader_mapper/mdf_reader/utils/validators.py +++ b/src/cdm_reader_mapper/mdf_reader/utils/validators.py @@ -8,8 +8,9 @@ import numpy as np import pandas as pd -from .. 
import properties -from ..codes import codes +from cdm_reader_mapper.mdf_reader import properties +from cdm_reader_mapper.mdf_reader.codes import codes + from .utilities import convert_str_boolean @@ -237,9 +238,9 @@ def validate( # Explicit boolean literals ("True"/"False") override validation results if validated_columns: validated_columns = list(dict.fromkeys(validated_columns)) - to_bool = data[validated_columns].applymap(convert_str_boolean) - false_mask = to_bool.applymap(_is_false) - true_mask = to_bool.applymap(_is_true) + to_bool = data[validated_columns].apply(lambda col: col.map(convert_str_boolean)) + false_mask = to_bool.apply(lambda col: col.map(_is_false)) + true_mask = to_bool.apply(lambda col: col.map(_is_true)) mask[validated_columns] = mask[validated_columns].mask(false_mask, False) mask[validated_columns] = mask[validated_columns].mask(true_mask, True) diff --git a/src/cdm_reader_mapper/mdf_reader/writer.py b/src/cdm_reader_mapper/mdf_reader/writer.py index 6193a042..a16209c2 100755 --- a/src/cdm_reader_mapper/mdf_reader/writer.py +++ b/src/cdm_reader_mapper/mdf_reader/writer.py @@ -9,13 +9,14 @@ import pandas as pd -from ..common import get_filename -from ..common.iterators import ( +from cdm_reader_mapper.common import ( ParquetStreamReader, + get_filename, is_valid_iterator, parquet_stream_from_iterable, ) -from ..properties import SupportedFileTypes +from cdm_reader_mapper.properties import SupportedFileTypes + from .utils.utilities import join, update_column_names, update_dtypes diff --git a/src/cdm_reader_mapper/metmetpy/correct.py b/src/cdm_reader_mapper/metmetpy/correct.py index 64bba177..a9e5f37a 100755 --- a/src/cdm_reader_mapper/metmetpy/correct.py +++ b/src/cdm_reader_mapper/metmetpy/correct.py @@ -62,9 +62,8 @@ import pandas as pd -from ..common import logging_hdlr -from ..common.iterators import ProcessFunction, process_function -from ..common.json_dict import collect_json_files, combine_dicts +from cdm_reader_mapper.common import 
ProcessFunction, collect_json_files, combine_dicts, logging_hdlr, process_function + from . import properties from .datetime import correction_functions as corr_f_dt from .platform_type import correction_functions as corr_f_pt diff --git a/src/cdm_reader_mapper/metmetpy/datetime/correction_functions.py b/src/cdm_reader_mapper/metmetpy/datetime/correction_functions.py index 75ccdfc6..096893bb 100755 --- a/src/cdm_reader_mapper/metmetpy/datetime/correction_functions.py +++ b/src/cdm_reader_mapper/metmetpy/datetime/correction_functions.py @@ -10,7 +10,8 @@ import pandas as pd -from .. import properties +from cdm_reader_mapper.metmetpy import properties + from . import model_datetimes diff --git a/src/cdm_reader_mapper/metmetpy/datetime/model_datetimes.py b/src/cdm_reader_mapper/metmetpy/datetime/model_datetimes.py index 98db90ac..69224682 100755 --- a/src/cdm_reader_mapper/metmetpy/datetime/model_datetimes.py +++ b/src/cdm_reader_mapper/metmetpy/datetime/model_datetimes.py @@ -18,7 +18,7 @@ import numpy as np import pandas as pd -from .. import properties +from cdm_reader_mapper.metmetpy import properties def datetime_decimalhour_to_hm(decimal_hours: float) -> tuple[int, int]: diff --git a/src/cdm_reader_mapper/metmetpy/platform_type/correction_functions.py b/src/cdm_reader_mapper/metmetpy/platform_type/correction_functions.py index 32d0bd19..1137a2ba 100755 --- a/src/cdm_reader_mapper/metmetpy/platform_type/correction_functions.py +++ b/src/cdm_reader_mapper/metmetpy/platform_type/correction_functions.py @@ -12,7 +12,7 @@ import pandas as pd -from .. 
import properties +from cdm_reader_mapper.metmetpy import properties def is_num(x: Any) -> bool: diff --git a/src/cdm_reader_mapper/metmetpy/validate.py b/src/cdm_reader_mapper/metmetpy/validate.py index 16bc95f0..af1fc724 100755 --- a/src/cdm_reader_mapper/metmetpy/validate.py +++ b/src/cdm_reader_mapper/metmetpy/validate.py @@ -63,9 +63,8 @@ import pandas as pd -from ..common import logging_hdlr -from ..common.iterators import ProcessFunction, process_function -from ..common.json_dict import collect_json_files, combine_dicts +from cdm_reader_mapper.common import ProcessFunction, collect_json_files, combine_dicts, logging_hdlr, process_function + from . import properties from .datetime import model_datetimes diff --git a/tests/test_cdm_io.py b/tests/test_cdm_io.py index 01aacea2..1050ca2f 100755 --- a/tests/test_cdm_io.py +++ b/tests/test_cdm_io.py @@ -202,7 +202,9 @@ def test_read_data_csv(csv_path, example_data): data = bundle.data assert isinstance(data, pd.DataFrame) - pd.testing.assert_frame_equal(bundle.data, example_data.astype(str)) + exp_data = example_data.astype(str).astype(object) + + pd.testing.assert_frame_equal(bundle.data, exp_data) def test_read_data_single_csv(csv_path, example_data): @@ -214,7 +216,9 @@ def test_read_data_single_csv(csv_path, example_data): data = bundle.data assert isinstance(data, pd.DataFrame) - pd.testing.assert_frame_equal(bundle.data, example_data["observations-sst"].astype(str)) + exp_data = example_data["observations-sst"].astype(str).astype(object) + + pd.testing.assert_frame_equal(bundle.data, exp_data) def test_read_data_raises_data_format(csv_path): @@ -244,7 +248,10 @@ def test_read_data_parquet(parquet_path, example_data): data = bundle.data assert isinstance(data, pd.DataFrame) - pd.testing.assert_frame_equal(bundle.data, example_data) + exp_data = example_data.copy() + exp_data[("observations-sst", "B")] = exp_data[("observations-sst", "B")].astype(object) + + pd.testing.assert_frame_equal(bundle.data, exp_data) 
def test_read_data_feather(feather_path, example_data): @@ -256,7 +263,10 @@ def test_read_data_feather(feather_path, example_data): data = bundle.data assert isinstance(data, pd.DataFrame) - pd.testing.assert_frame_equal(bundle.data, example_data) + exp_data = example_data.copy() + exp_data[("observations-sst", "B")] = exp_data[("observations-sst", "B")].astype(object) + + pd.testing.assert_frame_equal(bundle.data, exp_data) def test_table_to_file_raises(csv_path, example_data): @@ -307,6 +317,7 @@ def test_read_tables_testdata_str_conversion(tmp_path): db_tmp = read_tables(tmp_path, suffix="str") expected = read_tables(source, extension="pq", to_str=True, imodel=imodel) + pd.testing.assert_frame_equal(db_tmp.data["header"], expected.data) diff --git a/tests/test_cdm_mapper.py b/tests/test_cdm_mapper.py index 66d2cb70..33774566 100755 --- a/tests/test_cdm_mapper.py +++ b/tests/test_cdm_mapper.py @@ -61,7 +61,7 @@ def data_header_expected(): ("header", "duplicate_status"): [4, 4, 4, 4], ("header", "platform_type"): [2, 33, 32, 45], ("header", "location_quality"): [2, 0, 0, 0], - ("header", "source_id"): [pd.NA, pd.NA, pd.NA, pd.NA], + ("header", "source_id"): [None, None, None, None], } ) return data.astype( @@ -493,39 +493,39 @@ def test_map_model_pub47(): ] dtypes = { - ("header", "station_name"): str, + ("header", "station_name"): "object", ("header", "platform_sub_type"): "Int64", - ("header", "primary_station_id"): str, + ("header", "primary_station_id"): "object", ("header", "station_record_number"): "Int64", ("header", "report_duration"): "Int64", ("observations-at", "sensor_automation_status"): "Int64", ("observations-at", "z_coordinate"): "Float64", ("observations-at", "observation_height_above_station_surface"): "Float64", - ("observations-at", "sensor_id"): str, + ("observations-at", "sensor_id"): "object", ("observations-dpt", "sensor_automation_status"): "Int64", ("observations-dpt", "z_coordinate"): "Float64", ("observations-dpt", 
"observation_height_above_station_surface"): "Float64", - ("observations-dpt", "sensor_id"): str, + ("observations-dpt", "sensor_id"): "object", ("observations-slp", "sensor_automation_status"): "Int64", ("observations-slp", "z_coordinate"): "Float64", ("observations-slp", "observation_height_above_station_surface"): "Float64", - ("observations-slp", "sensor_id"): str, + ("observations-slp", "sensor_id"): "object", ("observations-sst", "sensor_automation_status"): "Int64", ("observations-sst", "z_coordinate"): "Float64", ("observations-sst", "observation_height_above_station_surface"): "Float64", - ("observations-sst", "sensor_id"): str, + ("observations-sst", "sensor_id"): "object", ("observations-wbt", "sensor_automation_status"): "Int64", ("observations-wbt", "z_coordinate"): "Float64", ("observations-wbt", "observation_height_above_station_surface"): "Float64", - ("observations-wbt", "sensor_id"): str, + ("observations-wbt", "sensor_id"): "object", ("observations-wd", "sensor_automation_status"): "Int64", ("observations-wd", "z_coordinate"): "Float64", ("observations-wd", "observation_height_above_station_surface"): "Float64", - ("observations-wd", "sensor_id"): str, + ("observations-wd", "sensor_id"): "object", ("observations-ws", "sensor_automation_status"): "Int64", ("observations-ws", "z_coordinate"): "Float64", ("observations-ws", "observation_height_above_station_surface"): "Float64", - ("observations-ws", "sensor_id"): str, + ("observations-ws", "sensor_id"): "object", } result = result[columns] diff --git a/tests/test_common_select.py b/tests/test_common_select.py index 8cf10e3a..7c633271 100755 --- a/tests/test_common_select.py +++ b/tests/test_common_select.py @@ -291,6 +291,8 @@ def test_split_by_boolean_psr_true( ) exp = sample_psr.copy().read() + exp["B"] = exp["B"].astype(object) + exp_selected = exp.loc[exp_selected_idx] exp_rejected = exp.loc[exp_rejected_idx] @@ -369,6 +371,8 @@ def test_split_by_boolean_psr_false( ) exp = 
sample_psr.copy().read() + exp["B"] = exp["B"].astype(object) + exp_selected = exp.loc[exp_selected_idx] exp_rejected = exp.loc[exp_rejected_idx] diff --git a/tests/test_common_utils.py b/tests/test_common_utils.py index 1f5d5186..8d059279 100755 --- a/tests/test_common_utils.py +++ b/tests/test_common_utils.py @@ -8,6 +8,8 @@ from pathlib import Path from urllib.parse import urlparse +import numpy as np +import pandas as pd import pytest import requests @@ -28,6 +30,7 @@ open_json_file, ) from cdm_reader_mapper.common.logging_hdlr import init_logger +from cdm_reader_mapper.common.object_types import standardize_object_columns def compute_md5(content: bytes) -> str: @@ -481,3 +484,27 @@ def test_get_path_missing_file_module_not_found(tmp_path, caplog): assert result is None assert any("No module named" in msg for msg in caplog.messages) + + +def test_standardize_object_columns(): + df = pd.DataFrame( + { + "A": pd.Series(["x", "y", None], dtype="string"), + "B": pd.Series([1.1, 2.2, 3.3]), + "C": pd.Series([1, 2, 3]), + "D": pd.Series([True, False, np.nan]), + } + ) + + result = standardize_object_columns(df) + + expected = pd.DataFrame( + { + "A": pd.Series(["x", "y", None], dtype=object), + "B": pd.Series([1.1, 2.2, 3.3]), + "C": pd.Series([1, 2, 3]), + "D": pd.Series([True, False, None], dtype=object), + } + ) + + pd.testing.assert_frame_equal(result, expected) diff --git a/tests/test_databundle.py b/tests/test_databundle.py index 94eab1d1..c285f34b 100755 --- a/tests/test_databundle.py +++ b/tests/test_databundle.py @@ -940,7 +940,7 @@ def test_correct_pt_psr(): result = db.correct_pt() - expected = pd.DataFrame({PT: ["5", "7", "5"]}) + expected = pd.DataFrame({PT: ["5", "7", "5"]}, dtype="object") pd.testing.assert_frame_equal(result.data.read(), expected) diff --git a/tests/test_mapper_conversions.py b/tests/test_mapper_conversions.py index 2c2de3e0..d0a4cc67 100755 --- a/tests/test_mapper_conversions.py +++ b/tests/test_mapper_conversions.py @@ -97,7 +97,7 
@@ def test_convert_array_general_to_str(exp, dtype, data): def test_convert_str_to_str(data, exp): series = pd.Series(data) result = _convert_str_to_str(series, "null") - expected = pd.Series(exp) + expected = pd.Series(exp, dtype=object) pd.testing.assert_series_equal(result, expected) @@ -164,8 +164,8 @@ def test_convert_str_array_from_str(data, exp): def test_convert_integer_to_str(data, exp): series = pd.Series(data) result = _convert_integer_to_str(series, "null") - expected_series = pd.Series(exp) - pd.testing.assert_series_equal(result, expected_series) + expected = pd.Series(exp, dtype=object) + pd.testing.assert_series_equal(result, expected) @pytest.mark.parametrize( @@ -238,7 +238,7 @@ def test_convert_integer_array_from_str(data, exp): def test_convert_float_to_str(data, decimal_places, exp): series = pd.Series(data) result = _convert_float_to_str(series, "null", decimal_places) - expected = pd.Series(exp) + expected = pd.Series(exp, dtype=object) pd.testing.assert_series_equal(result, expected) @@ -278,7 +278,7 @@ def test_convert_float_from_str(data, exp): def test_convert_float_array_to_str(data, exp): series = pd.Series(data) result = _convert_float_array_to_str(series, "null") - expected = pd.Series(exp, dtype=object) + expected = pd.Series(exp) pd.testing.assert_series_equal(result, expected) diff --git a/tests/test_mapping_functions.py b/tests/test_mapping_functions.py index f083ba7e..a914d655 100755 --- a/tests/test_mapping_functions.py +++ b/tests/test_mapping_functions.py @@ -311,10 +311,10 @@ def test_datetime_decimalhour_to_hm(row, expected_hr, expected_m): (pd.DataFrame([]), pd.DatetimeIndex([])), ], ) -def test_datetime_imma1(df, expected): +def test_datetime_imma1_base(df, expected): obj = MappingFunctions("dummy_model") result = obj.datetime_imma1(df) - pd.testing.assert_index_equal(result, expected) + pd.testing.assert_index_equal(result, expected.astype("datetime64[ns]")) @pytest.mark.parametrize( @@ -360,7 +360,7 @@ def 
test_datetime_imma1_to_utc(df, expected): expected_naive = expected.tz_localize(None) if expected.tz else expected - pd.testing.assert_index_equal(result, expected_naive) + pd.testing.assert_index_equal(result, expected_naive.astype("datetime64[ns]")) @pytest.mark.parametrize( @@ -394,7 +394,7 @@ def test_datetime_imma1_to_utc(df, expected): def test_datetime_imma1_701(df, expected): obj = MappingFunctions("dummy_model") result = obj.datetime_imma1_701(df) - pd.testing.assert_index_equal(result, expected) + pd.testing.assert_index_equal(result, expected.astype("datetime64[ns]")) @pytest.mark.parametrize( @@ -418,6 +418,7 @@ def test_datetime_imma1_701(df, expected): def test_datetime_immt(df, expected): obj = MappingFunctions("dummy_model") result = obj.datetime_immt(df) + expected = expected.astype("datetime64[ns]") pd.testing.assert_index_equal(result, expected) @@ -480,6 +481,7 @@ def test_datetime_craid(df, expected): def test_datetime_marob(df, expected): obj = MappingFunctions("dummy_model") result = obj.datetime_marob(df) + expected = expected.astype("datetime64[ns]") pd.testing.assert_series_equal(result, expected) @@ -756,13 +758,13 @@ def test_observing_programme(input_series, expected): @pytest.mark.parametrize( "input_series, prepend, append, separator, expected", [ - (pd.Series(["a", None, "c"]), "X", "Y", "", pd.Series(["XaY", None, "XcY"])), + (pd.Series(["a", None, "c"], dtype=object), "X", "Y", "", pd.Series(["XaY", None, "XcY"], dtype=object)), ( - pd.Series(["a", None, "c"]), + pd.Series(["a", None, "c"], dtype=object), "pre", "_app", "-", - pd.Series(["pre-a-_app", None, "pre-c-_app"]), + pd.Series(["pre-a-_app", None, "pre-c-_app"], dtype=object), ), (pd.Series([]), "X", "Y", "", pd.Series([], dtype=object)), ], @@ -770,7 +772,7 @@ def test_observing_programme(input_series, expected): def test_string_add(input_series, prepend, append, separator, expected): obj = MappingFunctions("dummy_model") result = obj.string_add(input_series, 
prepend=prepend, append=append, separator=separator) - pd.testing.assert_series_equal(result, expected) + pd.testing.assert_series_equal(result, expected.astype(object)) @pytest.mark.parametrize( @@ -783,7 +785,7 @@ def test_string_add(input_series, prepend, append, separator, expected): "-", None, None, - pd.Series(["X-1-3-Y", "X-2-4-Y"]), + pd.Series(["X-1-3-Y", "X-2-4-Y"], dtype=object), ), ( pd.DataFrame({"A": [1, 2], "B": [3, 4]}), @@ -792,7 +794,7 @@ def test_string_add(input_series, prepend, append, separator, expected): "-", [0, 1], [2, 2], - pd.Series(["X-01-03-Y", "X-02-04-Y"]), + pd.Series(["X-01-03-Y", "X-02-04-Y"], dtype=object), ), ( pd.DataFrame({"A": [1, 2], "B": [3, 4]}), @@ -801,7 +803,7 @@ def test_string_add(input_series, prepend, append, separator, expected): "-", None, None, - pd.Series(["1-3", "2-4"]), + pd.Series(["1-3", "2-4"], dtype=object), ), ( pd.DataFrame(columns=["A", "B"]), @@ -819,7 +821,7 @@ def test_string_add(input_series, prepend, append, separator, expected): ":", [0], [3], - pd.Series(["P:005:Q", "P:006:Q"]), + pd.Series(["P:005:Q", "P:006:Q"], dtype=object), ), ], ) @@ -954,7 +956,7 @@ def test_feet_to_m(input_series, expected): pd.DataFrame({"AAAA": [12], "MM": [3], "YY": [7], "GG": [5]}), "", "", - pd.Series(["a57ea24d0eb65ca390a63bd175c906db"], dtype="object"), + pd.Series(["a57ea24d0eb65ca390a63bd175c906db"], dtype=object), ), ( pd.DataFrame({"AAAA": [1, 2024], "MM": [1, 12], "YY": [1, 99], "GG": [1, 23]}), @@ -965,20 +967,20 @@ def test_feet_to_m(input_series, expected): "de3d414a8823554bbfde50f2305958d0", "5f4b0ac6560552bf9e69cc9de0541bd6", ], - dtype="object", + dtype=object, ), ), ( pd.DataFrame({"AAAA": [50], "MM": [6], "YY": [24], "GG": [4]}), "PRE-", "-POST", - pd.Series(["PRE-1d37cb121ceb546daba6431da61cd309-POST"], dtype="object"), + pd.Series(["PRE-1d37cb121ceb546daba6431da61cd309-POST"], dtype=object), ), ( pd.DataFrame({"AAAA": [], "MM": [], "YY": [], "GG": []}), "", "", - pd.Series([], dtype="object"), + 
pd.Series([], dtype=object), ), ], ) diff --git a/tests/test_mdf_reader.py b/tests/test_mdf_reader.py index 3ac303af..d1498854 100755 --- a/tests/test_mdf_reader.py +++ b/tests/test_mdf_reader.py @@ -43,6 +43,7 @@ def _read_mdf_test_data(data_model, select=None, drop=None, drop_idx=None, **kwa data = test_data[f"test_{data_model}"]["mdf_data"] mask = test_data[f"test_{data_model}"]["mdf_mask"] + print(data) expected = read_data(data_file=data, mask_file=mask) if not isinstance(result.data, pd.DataFrame): @@ -457,13 +458,15 @@ def test_validate_read_mdf_args_invalid_years(tmp_path): @pytest.fixture def example_data(): - return pd.DataFrame( + df = pd.DataFrame( { "A": [1, 2, 3], "B": [4.0, 5.0, 6.0], "C": ["x", "y", "z"], - } + }, ) + df = df.astype({"A": "int", "B": "float", "C": "object"}) + return df @pytest.fixture @@ -530,6 +533,7 @@ def feather_files(tmp_path, example_data, example_mask): def test_read_data_with_mask_csv(csv_files, example_data, example_mask, example_info): + print(example_data) data_file, mask_file, _ = csv_files data, mask, info = _read_data( data_file=data_file, diff --git a/tests/test_metmetpy.py b/tests/test_metmetpy.py index 746a1249..85f2a0f2 100755 --- a/tests/test_metmetpy.py +++ b/tests/test_metmetpy.py @@ -89,7 +89,7 @@ def test_icoads_to_datetime_basis(): ] ) ) - pd.testing.assert_series_equal(result, expected) + pd.testing.assert_series_equal(result, expected.astype("datetime64[ns]")) def test_icoads_to_datetime_missing_values(): @@ -113,7 +113,7 @@ def test_icoads_to_datetime_missing_values(): ] ) ) - pd.testing.assert_series_equal(result, expected) + pd.testing.assert_series_equal(result, expected.astype("datetime64[ns]")) def test_icoads_from_datetime_basis(): @@ -202,7 +202,9 @@ def test_icoads_to_datetime_missing_columns(): result = icoads(df, "to_datetime") - pd.testing.assert_series_equal(result, pd.Series(pd.to_datetime([None]))) + expected = pd.Series(pd.to_datetime([None])) + + pd.testing.assert_series_equal(result, 
expected.astype("datetime64[ns]")) def test_icoads_from_datetime_empty_series(): @@ -228,7 +230,7 @@ def test_to_datetime_basis(): expected = pd.Series(pd.to_datetime(["2000-01-10 12:30:00", "2001-02-15 06:15:00"])) - pd.testing.assert_series_equal(result, expected) + pd.testing.assert_series_equal(result, expected.astype("datetime64[ns]")) def test_to_datetime_no_correction(): @@ -894,7 +896,7 @@ def test_correct_pt_valid_iterable(): df2 = pd.DataFrame({PT: ["6", "7", None]}, index=[3, 4, 5]) result = correct_pt(ParquetStreamReader(iter([df1, df2])), "icoads_r300_d993") - exp = pd.DataFrame({PT: ["5", "7", "5", "6", "7", "5"]}) + exp = pd.DataFrame({PT: ["5", "7", "5", "6", "7", "5"]}, dtype="object") pd.testing.assert_frame_equal(result.read(), exp) diff --git a/tests/test_reader_convert_and_decode.py b/tests/test_reader_convert_and_decode.py index adbc3eb9..3a22b9a1 100755 --- a/tests/test_reader_convert_and_decode.py +++ b/tests/test_reader_convert_and_decode.py @@ -139,8 +139,7 @@ def test_object_to_object_strip(): series = pd.Series([" a ", "", " ", "b"]) result = conv.object_to_object(series) - - assert result.tolist() == ["a", None, None, "b"] + assert result.tolist() == ["a", pd.NA, pd.NA, "b"] def test_object_to_object_disable_lstrip(): diff --git a/tests/test_reader_utilities.py b/tests/test_reader_utilities.py index 31bdda23..e59f6ecf 100755 --- a/tests/test_reader_utilities.py +++ b/tests/test_reader_utilities.py @@ -231,12 +231,14 @@ def test_adjust_dtype(): assert _adjust_dtype("str", df) == "str" -def test_remove_boolean_values(): +def test_remove_boolean_values_sequence(): df = pd.DataFrame({"A": ["True", "False", "hello"], "B": [1, 2, 3]}) dtypes = {"A": "object", "B": "int"} result = remove_boolean_values(df, dtypes) - assert result.loc[0, "A"] is None - assert result.loc[1, "A"] is None + import numpy as np + + assert result.loc[0, "A"] is np.nan + assert result.loc[1, "A"] is np.nan assert result.loc[2, "A"] == "hello" assert 
result["B"].dtype.name == "int64" diff --git a/tests/test_writers.py b/tests/test_writers.py index 822369cd..6cfc6bbe 100755 --- a/tests/test_writers.py +++ b/tests/test_writers.py @@ -31,7 +31,6 @@ def db_data(): pattern = f"test_{imodel}" data_file = test_data[pattern]["mdf_data"] - db = read(data_file, mode="data") db.imodel = imodel return db @@ -40,13 +39,13 @@ def db_data(): def test_write_data_csv(tmp_path, db_data): db_data.write(out_dir=tmp_path, data_format="csv") tmppath = Path(tmp_path) - db_res = read( + db_exp = read( tmppath / "data.csv", info_file=tmppath / "info.json", data_format="csv", mode="data", ) - pd.testing.assert_frame_equal(db_data.data, db_res.data) + pd.testing.assert_frame_equal(db_data.data, db_exp.data) def test_write_tables_csv(tmp_path, db_tables):