diff --git a/CHANGELOG.rst b/CHANGELOG.rst
index 69bab7ad..698c0cee 100644
--- a/CHANGELOG.rst
+++ b/CHANGELOG.rst
@@ -10,7 +10,7 @@ Contributor to this version: Ludwig Lierhammer (:user:`ludwiglierhammer`)
Announcements
^^^^^^^^^^^^^
* `cdm_reader_mapper` now drops support for Python 3.10 (:pull:`419`)
-* `cdm_reader_mapper` now uses `cruft ` and the `Ouranosinc cookiecutter template ` (:issue:`369`, :pull:`419`)
+* `cdm_reader_mapper` now uses `cruft <https://cruft.github.io/cruft/>`_ and the `Ouranosinc cookiecutter template <https://github.com/Ouranosinc/cookiecutter-pypackage>`_ (:issue:`369`, :pull:`419`)
New features and enhancements
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
@@ -20,7 +20,7 @@ Breaking changes
^^^^^^^^^^^^^^^^
* Development dependencies ("dev", "docs") are now installed via the new `dependency-groups` conventions (`PEP 735 <https://peps.python.org/pep-0735/>`_) (:pull:`419`)
* `prek` is now the suggested pre-commit runner (installed by default via `pip install --group dev`) (:pull:`419`)
-* delete submodule ``src.cdm_reader_mapper.duplicates`` (:issue:`152`, :issue:`283`, :pull:`434`)
+* submodule ``src.cdm_reader_mapper.duplicates`` has been deleted and moved to `marine_qc `_ (:issue:`283`, :pull:`434`)
* ``cdm_reader_mapper.DupDetect`` is not importable anymore
* ``cdm_reader_mapper.duplicate_check`` is not importable anymore
@@ -30,7 +30,8 @@ Breaking changes
* ``cdm_reader_mapper.DataBundle.remove_duplicates`` is not callable anymore
* ``cdm_reader_mapper.DataBundle`` does not have attribute ``DupDetect`` anymore
-* submodule ``src.cdm_reader_mapper.duplicates`` has been moved to `marine_qc `_ (:issue:`283`, :pull:`434`)
+* the code has been updated to support pandas >= 3.0.0 (:pull:`435`)
+* reader functions now return object types instead of string types (:pull:`435`)
Internal changes
@@ -53,6 +54,8 @@ Internal changes
* remove helper class `core._utilities._DataBundle` and integrate it in `core.databundle.DataBundle` (:pull:`419`)
* make use of `pathlib.Path` instead of `os.path` (:pull:`419`)
* use consistently parameter "imodel" instead of "data_model" and "correction_method" instead of "fix_method" in `metmetpy` modules (:pull:`419`)
+* make some JSON and iterator utility functions directly importable in submodule ``cdm_reader_mapper.common`` (:pull:`435`)
+* replace relative imports with absolute imports for any submodules located one or more directories above the current submodule (:pull:`435`)
2.4.1 (2016-04-16)
diff --git a/environment-docs.yml b/environment-docs.yml
index e67b4c5b..b9676e4f 100755
--- a/environment-docs.yml
+++ b/environment-docs.yml
@@ -12,7 +12,7 @@ dependencies:
- h5netcdf >=1.6.1
- h5py >=3.13.0
- numpy >=2.1.3
- - pandas >=2.2.0
+ - pandas >=3.0.0
- pyarrow >=15.0.0 # Strongly encouraged for Pandas v2.2.0+
- timezonefinder >6.5.0
- xarray >=2023.11.0,!=2024.10.0
diff --git a/pyproject.toml b/pyproject.toml
index 22dab2e4..f4e4acfc 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -93,7 +93,7 @@ dependencies = [
"h5netcdf >=1.6.1",
"h5py >= 3.13.0",
"numpy>=2.1.3",
- "pandas>=2.2.0",
+ "pandas>=3.0.0",
"platformdirs >4.0.0",
"pyarrow >=15.0.0",
"requests",
diff --git a/src/cdm_reader_mapper/cdm_mapper/codes/codes.py b/src/cdm_reader_mapper/cdm_mapper/codes/codes.py
index e3388a67..a9a0c058 100755
--- a/src/cdm_reader_mapper/cdm_mapper/codes/codes.py
+++ b/src/cdm_reader_mapper/cdm_mapper/codes/codes.py
@@ -13,14 +13,13 @@
from pathlib import Path
from typing import Any
-from cdm_reader_mapper.common.json_dict import (
+from cdm_reader_mapper.cdm_mapper import properties
+from cdm_reader_mapper.common import (
collect_json_files,
combine_dicts,
open_json_file,
)
-from .. import properties
-
def _eval(s: str) -> Any:
"""
diff --git a/src/cdm_reader_mapper/cdm_mapper/mapper.py b/src/cdm_reader_mapper/cdm_mapper/mapper.py
index 712823db..3a8f28ab 100755
--- a/src/cdm_reader_mapper/cdm_mapper/mapper.py
+++ b/src/cdm_reader_mapper/cdm_mapper/mapper.py
@@ -18,11 +18,12 @@
import pandas as pd
-from cdm_reader_mapper.common import logging_hdlr
-from cdm_reader_mapper.common.iterators import (
+from cdm_reader_mapper.common import (
ParquetStreamReader,
ProcessFunction,
+ logging_hdlr,
process_function,
+ standardize_object_columns,
)
from . import properties
@@ -65,14 +66,14 @@ def _drop_duplicated_rows(df: pd.DataFrame) -> pd.DataFrame:
Parameters
----------
df : pd.DataFrame
- Input DataFrame to drop duplictaed rows.
+ Input DataFrame to drop duplicated rows.
Returns
-------
pd.DataFrame
Input DataFrame with deleted duplicated rows.
"""
- list_cols = [col for col in df.columns if df[col].apply(lambda x: isinstance(x, list)).any()]
+ list_cols = [col for col in df.columns if df[col].dtype == "object" and df[col].apply(lambda x: isinstance(x, list)).any()]
for col in list_cols:
df[col] = df[col].apply(lambda x: tuple(x) if isinstance(x, list) else x)
@@ -481,7 +482,7 @@ def _table_mapping(
if drop_duplicates:
table_df = _drop_duplicated_rows(table_df)
- return table_df
+ return standardize_object_columns(table_df)
def _prepare_cdm_tables(cdm_subset: str | Sequence[str]) -> dict[str, Any]:
diff --git a/src/cdm_reader_mapper/cdm_mapper/properties.py b/src/cdm_reader_mapper/cdm_mapper/properties.py
index f122ca61..bcf62394 100755
--- a/src/cdm_reader_mapper/cdm_mapper/properties.py
+++ b/src/cdm_reader_mapper/cdm_mapper/properties.py
@@ -2,7 +2,7 @@
from __future__ import annotations
-from ..properties import SupportedDataModels, SupportedFileTypes
+from cdm_reader_mapper.properties import SupportedDataModels, SupportedFileTypes
__all__ = [
@@ -27,7 +27,7 @@
"observations-slp",
]
-# ...from CDM table definitions psuedo-sql(...) --------------------------------
+# ...from CDM table definitions pseudo-sql(...) --------------------------------
pandas_dtypes: dict[str, dict[str, str]] = {}
pandas_dtypes["from_sql"] = {}
pandas_dtypes["from_sql"]["timestamp with timezone"] = "object"
diff --git a/src/cdm_reader_mapper/cdm_mapper/reader.py b/src/cdm_reader_mapper/cdm_mapper/reader.py
index 8d5b0837..45282b9c 100755
--- a/src/cdm_reader_mapper/cdm_mapper/reader.py
+++ b/src/cdm_reader_mapper/cdm_mapper/reader.py
@@ -51,11 +51,10 @@
import pandas as pd
-from cdm_reader_mapper.common import get_filename, logging_hdlr
+from cdm_reader_mapper.common import get_filename, logging_hdlr, standardize_object_columns
from cdm_reader_mapper.core.databundle import DataBundle
-from ..properties import SupportedFileTypes
-from .properties import cdm_tables
+from .properties import SupportedFileTypes, cdm_tables
from .utils.conversions import convert_from_str_df, convert_to_str_df
from .utils.utilities import get_cdm_subset, get_usecols
@@ -439,4 +438,6 @@ def read_tables(
elif to_str is True:
merged = convert_to_str_df(merged, imodel, cdm_subset=cdm_subset)
- return DataBundle(data=merged, columns=merged.columns, mode="tables")
+ data = standardize_object_columns(merged)
+
+ return DataBundle(data=data, columns=merged.columns, mode="tables")
diff --git a/src/cdm_reader_mapper/cdm_mapper/tables/tables.py b/src/cdm_reader_mapper/cdm_mapper/tables/tables.py
index 8c58395e..a9a44742 100755
--- a/src/cdm_reader_mapper/cdm_mapper/tables/tables.py
+++ b/src/cdm_reader_mapper/cdm_mapper/tables/tables.py
@@ -15,14 +15,13 @@
from pathlib import Path
from typing import Any
-from cdm_reader_mapper.common.json_dict import (
+from cdm_reader_mapper.cdm_mapper import properties
+from cdm_reader_mapper.common import (
collect_json_files,
combine_dicts,
open_json_file,
)
-from .. import properties
-
def get_cdm_atts(
cdm_tables: str | tuple[str, str] | Sequence[str | tuple[str, str]] | None = None,
diff --git a/src/cdm_reader_mapper/cdm_mapper/utils/conversions.py b/src/cdm_reader_mapper/cdm_mapper/utils/conversions.py
index c919d364..32ade61b 100755
--- a/src/cdm_reader_mapper/cdm_mapper/utils/conversions.py
+++ b/src/cdm_reader_mapper/cdm_mapper/utils/conversions.py
@@ -8,8 +8,8 @@
import numpy as np
import pandas as pd
-from .. import properties
-from ..tables.tables import get_cdm_atts, get_imodel_maps
+from cdm_reader_mapper.cdm_mapper import properties
+from cdm_reader_mapper.cdm_mapper.tables.tables import get_cdm_atts, get_imodel_maps
class BaseConverter:
@@ -269,7 +269,7 @@ def _convert_value(x: Any) -> str:
return "{" + ",".join(str_list) + "}"
- return data.apply(_convert_value).astype(object)
+ return data.apply(_convert_value)
def _convert_str_to_str(data: pd.Series, null_label: str) -> pd.Series:
@@ -311,7 +311,7 @@ def _return_str(x: Any, null_label: str) -> str:
return null_label
return str(x)
- return data.apply(lambda x: _return_str(x, null_label))
+ return data.apply(lambda x: _return_str(x, null_label)).astype(object)
def _convert_str_from_str(data: pd.Series, null_label: str) -> pd.Series:
@@ -372,7 +372,7 @@ def _convert_str_array_to_str(data: pd.Series, null_label: str) -> pd.Series:
pd.Series
Series with integer arrays in "{...}" format.
"""
- return _convert_array_general_to_str(data, null_label, dtype=str)
+ return _convert_array_general_to_str(data, null_label, dtype=object)
def _convert_str_array_from_str(data: pd.Series, null_label: str) -> pd.Series:
@@ -662,7 +662,10 @@ def _convert_datetime_from_str(data: pd.Series, null_label: str) -> pd.Series:
Series with values converted to pandas datetime dtype (datetime64[ns]).
Invalid or non-convertible values are set to NaT.
"""
- return pd.to_datetime(data, errors="coerce")
+ dt = pd.to_datetime(data, errors="coerce")
+ if dt.dt.tz is not None:
+ dt = dt.dt.tz_localize(None)
+ return dt.astype("datetime64[ns]")
def _convert_column(
@@ -832,6 +835,7 @@ def _convert_columns(
null_label,
converters,
)
+
return data
diff --git a/src/cdm_reader_mapper/cdm_mapper/utils/mapping_functions.py b/src/cdm_reader_mapper/cdm_mapper/utils/mapping_functions.py
index ef5abcd6..fe74b193 100755
--- a/src/cdm_reader_mapper/cdm_mapper/utils/mapping_functions.py
+++ b/src/cdm_reader_mapper/cdm_mapper/utils/mapping_functions.py
@@ -369,32 +369,31 @@ def datetime_imma1(self, df: pd.DataFrame) -> pd.DatetimeIndex:
DatetimeIndex of converted timestamps.
"""
if df.empty:
- return pd.DatetimeIndex([])
+ return pd.DatetimeIndex([], dtype="datetime64[ns]")
df = df.iloc[:, 0:4]
date_format = "%Y-%m-%d-%H-%M"
hr_ = df.columns[-1]
df = df.assign(HR=df.iloc[:, -1])
df["M"] = df["HR"].copy()
- df = df.drop(columns=hr_, axis=1)
-
+ df = df.drop(columns=hr_)
hr_min = df.apply(lambda x: self.datetime_decimalhour_to_hm(x), axis=1)
df["HR"] = hr_min["HR"]
df["M"] = hr_min["M"]
df = df.apply(lambda col: col.map(to_int))
- strings = df.astype(str).apply("-".join, axis=1).values
+ strings = df.apply(lambda row: "-".join(map(str, row)), axis=1).values
result = pd.to_datetime(
strings,
format=date_format,
errors="coerce",
)
result.index = df.index
- return result
+ return result.astype("datetime64[ns]")
def datetime_imma1_to_utc(self, df: pd.DataFrame) -> pd.DatetimeIndex:
"""
- Convert to pandas datetime object for IMMA1 deck 701 format.
+ Convert IMMA1 date fields to pandas datetimes in UTC time.
Set missing hour to 12 and use latitude and longitude information
to convert local midday to UTC time.
@@ -402,7 +401,7 @@ def datetime_imma1_to_utc(self, df: pd.DataFrame) -> pd.DatetimeIndex:
Parameters
----------
df : pd.DataFrame
- IMMA1 deck 701 dataset containing year, month, day, latitude, and longitude.
+ IMMA1 dataset containing year, month, day, latitude, and longitude.
Returns
-------
@@ -410,7 +409,7 @@ def datetime_imma1_to_utc(self, df: pd.DataFrame) -> pd.DatetimeIndex:
DatetimeIndex with timestamps converted to UTC.
"""
if df.empty:
- return pd.DatetimeIndex([])
+ return pd.DatetimeIndex([], dtype="datetime64[ns]")
date_format = "%Y-%m-%d-%H-%M"
@@ -437,7 +436,7 @@ def datetime_imma1_to_utc(self, df: pd.DataFrame) -> pd.DatetimeIndex:
results = df_time.apply(lambda x: convert_to_utc_i(x["Dates"], x["Time_zone"]), axis=1)
results.index = df.index
- return pd.DatetimeIndex(results.dt.tz_convert(None))
+ return pd.DatetimeIndex(results.dt.tz_convert(None), dtype="datetime64[ns]")
def datetime_imma1_701(self, df: pd.DataFrame) -> pd.DatetimeIndex:
"""
@@ -454,7 +453,7 @@ def datetime_imma1_701(self, df: pd.DataFrame) -> pd.DatetimeIndex:
DatetimeIndex with converted timestamps.
"""
if df.empty:
- return pd.DatetimeIndex([])
+ return pd.DatetimeIndex([], dtype="datetime64[ns]")
hr = df.iloc[:, 3]
valid_mask = hr.notna()
@@ -484,7 +483,7 @@ def datetime_immt(self, df: pd.DataFrame) -> pd.DatetimeIndex:
DatetimeIndex of converted timestamps.
"""
if df.empty:
- return pd.DatetimeIndex([])
+ return pd.DatetimeIndex([], dtype="datetime64[ns]")
date_format = "%Y-%m-%d-%H-%M"
df = df.copy()
@@ -496,7 +495,7 @@ def datetime_immt(self, df: pd.DataFrame) -> pd.DatetimeIndex:
format=date_format,
errors="coerce",
)
- return pd.DatetimeIndex(result)
+ return pd.DatetimeIndex(result).astype("datetime64[ns]")
def datetime_utcnow(self, df: pd.DataFrame) -> datetime.datetime:
"""
@@ -566,7 +565,7 @@ def datetime_marob(self, series: pd.Series, format: str = "%Y-%m-%dT%H:%M:%S") -
pd.Series
Series of converted dates.
"""
- return series_strptime(series, format)
+ return series_strptime(series, format).astype("datetime64[ns]")
def df_col_join(self, df: pd.DataFrame, sep: str) -> pd.Series:
"""
@@ -819,7 +818,6 @@ def string_add(
Series with modified string values.
"""
result = np.vectorize(string_add_i, otypes="O")(prepend, series, append, separator)
-
return pd.Series(result, index=series.index, dtype="object")
def string_join_add(
@@ -1016,7 +1014,7 @@ def gdac_uid(self, df: pd.DataFrame, prepend: str = "", append: str = "") -> pd.
for i, n in enumerate(name):
uid[i] = str(prepend) + uuid.uuid5(uuid.NAMESPACE_OID, str(n)).hex + str(append)
df["UUID"] = uid
- return df["UUID"]
+ return df["UUID"].astype(object)
def gdac_latitude(self, df: pd.DataFrame) -> pd.Series:
"""
diff --git a/src/cdm_reader_mapper/cdm_mapper/utils/utilities.py b/src/cdm_reader_mapper/cdm_mapper/utils/utilities.py
index 6f405cbd..21529665 100755
--- a/src/cdm_reader_mapper/cdm_mapper/utils/utilities.py
+++ b/src/cdm_reader_mapper/cdm_mapper/utils/utilities.py
@@ -4,7 +4,7 @@
from collections.abc import Iterable
from typing import Any
-from .. import properties
+from cdm_reader_mapper.cdm_mapper import properties
def dict_to_tuple_list(dic: dict[Any, Any]) -> list[tuple[Any, Any]]:
diff --git a/src/cdm_reader_mapper/cdm_mapper/writer.py b/src/cdm_reader_mapper/cdm_mapper/writer.py
index 64288e29..1ece36fc 100755
--- a/src/cdm_reader_mapper/cdm_mapper/writer.py
+++ b/src/cdm_reader_mapper/cdm_mapper/writer.py
@@ -29,7 +29,7 @@
from cdm_reader_mapper.common import get_filename, logging_hdlr
-from ..properties import SupportedFileTypes
+from .properties import SupportedFileTypes
from .tables.tables import get_cdm_atts
from .utils.conversions import convert_from_str_df, convert_to_str_df
from .utils.utilities import adjust_filename, dict_to_tuple_list, get_cdm_subset
diff --git a/src/cdm_reader_mapper/common/__init__.py b/src/cdm_reader_mapper/common/__init__.py
index 22a68549..ff175e2f 100755
--- a/src/cdm_reader_mapper/common/__init__.py
+++ b/src/cdm_reader_mapper/common/__init__.py
@@ -2,10 +2,12 @@
from __future__ import annotations
-from . import json_dict
from .getting_files import load_file
from .inspect import count_by_cat, get_length
from .io_files import get_filename
+from .iterators import ParquetStreamReader, ProcessFunction, is_valid_iterator, parquet_stream_from_iterable, process_disk_backed, process_function
+from .json_dict import collect_json_files, combine_dicts, open_json_file
+from .object_types import standardize_object_columns
from .replace import replace_columns
from .select import (
split_by_boolean,
@@ -17,15 +19,24 @@
__all__ = [
+ "ParquetStreamReader",
+ "ProcessFunction",
+ "collect_json_files",
+ "combine_dicts",
"count_by_cat",
"get_filename",
"get_length",
- "json_dict",
+ "is_valid_iterator",
"load_file",
+ "open_json_file",
+ "parquet_stream_from_iterable",
+ "process_disk_backed",
+ "process_function",
"replace_columns",
"split_by_boolean",
"split_by_boolean_false",
"split_by_boolean_true",
"split_by_column_entries",
"split_by_index",
+ "standardize_object_columns",
]
diff --git a/src/cdm_reader_mapper/common/iterators.py b/src/cdm_reader_mapper/common/iterators.py
index de1bb350..94e97bb2 100755
--- a/src/cdm_reader_mapper/common/iterators.py
+++ b/src/cdm_reader_mapper/common/iterators.py
@@ -18,6 +18,8 @@
import pyarrow.parquet as pq
import xarray as xr
+from .object_types import standardize_object_columns
+
class ProcessFunction:
r"""
@@ -420,6 +422,9 @@ def __exit__(self, _exc_type: type | None, _exc_val: BaseException | None, _exc_
self.close()
+ParquetStreamReader.__module__ = "cdm_reader_mapper.common"
+
+
def _sort_chunk_outputs(
outputs: tuple[Any, ...], capture_meta: bool, requested_types: tuple[type, ...]
) -> tuple[list[pd.DataFrame | pd.Series], list[Any]]:
@@ -558,6 +563,8 @@ def _parquet_generator(temp_dir: TemporaryDirectory[str], data_type: type, schem
for f in files:
df = pd.read_parquet(f)
+ df = standardize_object_columns(df)
+
if data_type is pd.Series:
s = df.iloc[:, 0].copy()
s.name = schema
diff --git a/src/cdm_reader_mapper/common/object_types.py b/src/cdm_reader_mapper/common/object_types.py
new file mode 100755
index 00000000..afe4e18e
--- /dev/null
+++ b/src/cdm_reader_mapper/common/object_types.py
@@ -0,0 +1,27 @@
+"""Utility function for standardizing object-dtype columns of a DataFrame."""
+
+from __future__ import annotations
+
+import pandas as pd
+
+
+def standardize_object_columns(df: pd.DataFrame) -> pd.DataFrame:
+ """
+ Convert string columns to object dtype and replace NaNs with None.
+
+ Parameters
+ ----------
+ df : pd.DataFrame
+ The input DataFrame to be standardized.
+
+ Returns
+ -------
+ pd.DataFrame
+ A copy of the input DataFrame with string columns cast to object dtype and NaNs in object columns replaced by None.
+ """
+ df = df.copy()
+ string_cols = df.select_dtypes(include="string").columns
+ df[string_cols] = df[string_cols].astype(object)
+ object_cols = df.select_dtypes(include="object").columns
+ df[object_cols] = df[object_cols].fillna(None)
+ return df
diff --git a/src/cdm_reader_mapper/core/_utilities.py b/src/cdm_reader_mapper/core/_utilities.py
index 7c6dff74..fdffc430 100755
--- a/src/cdm_reader_mapper/core/_utilities.py
+++ b/src/cdm_reader_mapper/core/_utilities.py
@@ -8,7 +8,7 @@
import numpy as np
import pandas as pd
-from cdm_reader_mapper.common.iterators import (
+from cdm_reader_mapper.common import (
ParquetStreamReader,
is_valid_iterator,
parquet_stream_from_iterable,
diff --git a/src/cdm_reader_mapper/core/databundle.py b/src/cdm_reader_mapper/core/databundle.py
index 4b3f20a8..985724c7 100755
--- a/src/cdm_reader_mapper/core/databundle.py
+++ b/src/cdm_reader_mapper/core/databundle.py
@@ -8,15 +8,16 @@
from cdm_reader_mapper.cdm_mapper.mapper import map_model
from cdm_reader_mapper.common import (
+ ParquetStreamReader,
count_by_cat,
get_length,
+ is_valid_iterator,
replace_columns,
split_by_boolean_false,
split_by_boolean_true,
split_by_column_entries,
split_by_index,
)
-from cdm_reader_mapper.common.iterators import ParquetStreamReader, is_valid_iterator
from cdm_reader_mapper.metmetpy import (
correct_datetime,
correct_pt,
diff --git a/src/cdm_reader_mapper/core/reader.py b/src/cdm_reader_mapper/core/reader.py
index a7d0efa9..d2ca72c6 100755
--- a/src/cdm_reader_mapper/core/reader.py
+++ b/src/cdm_reader_mapper/core/reader.py
@@ -6,8 +6,8 @@
from cdm_reader_mapper.cdm_mapper.reader import read_tables
from cdm_reader_mapper.mdf_reader.reader import read_data, read_mdf
+from cdm_reader_mapper.properties import SupportedReadModes
-from ..properties import SupportedReadModes
from .databundle import DataBundle
diff --git a/src/cdm_reader_mapper/core/writer.py b/src/cdm_reader_mapper/core/writer.py
index 49dba472..097217e2 100755
--- a/src/cdm_reader_mapper/core/writer.py
+++ b/src/cdm_reader_mapper/core/writer.py
@@ -8,8 +8,7 @@
from cdm_reader_mapper.cdm_mapper.writer import write_tables
from cdm_reader_mapper.mdf_reader.writer import write_data
-
-from ..properties import SupportedWriteModes
+from cdm_reader_mapper.properties import SupportedWriteModes
supported_write_modes = get_args(SupportedWriteModes)
diff --git a/src/cdm_reader_mapper/mdf_reader/codes/codes.py b/src/cdm_reader_mapper/mdf_reader/codes/codes.py
index a8786913..0501f166 100755
--- a/src/cdm_reader_mapper/mdf_reader/codes/codes.py
+++ b/src/cdm_reader_mapper/mdf_reader/codes/codes.py
@@ -8,13 +8,12 @@
from pathlib import Path
from typing import Any
-from cdm_reader_mapper.common.json_dict import (
+from cdm_reader_mapper.common import (
collect_json_files,
combine_dicts,
open_json_file,
)
-
-from .. import properties
+from cdm_reader_mapper.mdf_reader import properties
def read_table(
diff --git a/src/cdm_reader_mapper/mdf_reader/properties.py b/src/cdm_reader_mapper/mdf_reader/properties.py
index 1cb9ade1..a934a368 100755
--- a/src/cdm_reader_mapper/mdf_reader/properties.py
+++ b/src/cdm_reader_mapper/mdf_reader/properties.py
@@ -3,7 +3,7 @@
from __future__ import annotations
from typing import get_args
-from ..properties import NumericTypes, ObjectTypes, SupportedDataModels
+from cdm_reader_mapper.properties import NumericTypes, ObjectTypes, SupportedDataModels
__all__ = [
diff --git a/src/cdm_reader_mapper/mdf_reader/reader.py b/src/cdm_reader_mapper/mdf_reader/reader.py
index 0c85d4c3..ec494e0f 100755
--- a/src/cdm_reader_mapper/mdf_reader/reader.py
+++ b/src/cdm_reader_mapper/mdf_reader/reader.py
@@ -8,9 +8,9 @@
import pandas as pd
from cdm_reader_mapper import DataBundle
+from cdm_reader_mapper.common import open_json_file, standardize_object_columns
+from cdm_reader_mapper.properties import SupportedFileTypes
-from ..common.json_dict import open_json_file
-from ..properties import SupportedFileTypes
from .utils.filereader import FileReader
from .utils.utilities import as_list, as_path, read_csv, read_feather, read_parquet, validate_arg
@@ -302,6 +302,11 @@ def _read_data(
**mask_kwargs,
)
+ data = standardize_object_columns(data)
+
+ if "dtypes" in info:
+ info["dtypes"] = info["dtypes"].replace("str", "object")
+
return data, mask, info
diff --git a/src/cdm_reader_mapper/mdf_reader/schemas/schemas.py b/src/cdm_reader_mapper/mdf_reader/schemas/schemas.py
index 6ae0e845..7cf314e8 100755
--- a/src/cdm_reader_mapper/mdf_reader/schemas/schemas.py
+++ b/src/cdm_reader_mapper/mdf_reader/schemas/schemas.py
@@ -8,9 +8,8 @@
from pathlib import Path
from typing import Any, TypedDict, get_args
-from cdm_reader_mapper.common.json_dict import collect_json_files, combine_dicts
-
-from .. import properties
+from cdm_reader_mapper.common import collect_json_files, combine_dicts
+from cdm_reader_mapper.mdf_reader import properties
class SectionDict(TypedDict, total=False):
@@ -123,8 +122,6 @@ def _resolve_schema_files(
if ext_schema_path:
schema_path = Path(ext_schema_path).resolve()
path = schema_path / f"{schema_path.name}.json"
- # print(path)
- # exit()
if not path.is_file():
raise FileNotFoundError(f"Can't find input schema path {ext_schema_path}")
return [path]
diff --git a/src/cdm_reader_mapper/mdf_reader/utils/convert_and_decode.py b/src/cdm_reader_mapper/mdf_reader/utils/convert_and_decode.py
index 318770d2..3f857653 100755
--- a/src/cdm_reader_mapper/mdf_reader/utils/convert_and_decode.py
+++ b/src/cdm_reader_mapper/mdf_reader/utils/convert_and_decode.py
@@ -7,7 +7,8 @@
import pandas as pd
-from .. import properties
+from cdm_reader_mapper.mdf_reader import properties
+
from .utilities import convert_str_boolean
@@ -263,6 +264,8 @@ def object_to_numeric(
pd.Series
Converted Series.
"""
+ if data.dtype == "str":
+ data = data.astype(object)
if data.dtype != "object":
return data
@@ -298,6 +301,8 @@ def object_to_object(
pd.Series
Cleaned Series.
"""
+ if data.dtype == "str":
+ data = data.astype(object)
if data.dtype != "object":
return data
@@ -308,7 +313,7 @@ def object_to_object(
elif disable_white_strip == "r":
data = data.str.lstrip()
- return data.apply(lambda x: None if isinstance(x, str) and (x.isspace() or not x) else x)
+ return data.replace({"": pd.NA})
def object_to_datetime(
self,
@@ -332,6 +337,8 @@ def object_to_datetime(
pd.Series
Datetime Series.
"""
+ if data.dtype == "str":
+ data = data.astype(object)
if data.dtype != "object":
return data
diff --git a/src/cdm_reader_mapper/mdf_reader/utils/filereader.py b/src/cdm_reader_mapper/mdf_reader/utils/filereader.py
index 6c2985ea..68f5ffc5 100755
--- a/src/cdm_reader_mapper/mdf_reader/utils/filereader.py
+++ b/src/cdm_reader_mapper/mdf_reader/utils/filereader.py
@@ -13,10 +13,10 @@
import pandas as pd
import xarray as xr
-from cdm_reader_mapper.common.iterators import ProcessFunction, process_function
+from cdm_reader_mapper.common import ProcessFunction, process_function, standardize_object_columns
from cdm_reader_mapper.core.databundle import DataBundle
+from cdm_reader_mapper.mdf_reader import properties
-from .. import properties
from .convert_and_decode import convert_and_decode
from .parser import (
ParserConfig,
@@ -279,6 +279,8 @@ def _process_data(
for object_column in object_columns:
data[object_column] = data[object_column].str.encode(config.encoding).str.decode("utf-8")
+ data = standardize_object_columns(data)
+
return data, mask, config
@process_function()
diff --git a/src/cdm_reader_mapper/mdf_reader/utils/parser.py b/src/cdm_reader_mapper/mdf_reader/utils/parser.py
index 82d5d384..218b6eb8 100755
--- a/src/cdm_reader_mapper/mdf_reader/utils/parser.py
+++ b/src/cdm_reader_mapper/mdf_reader/utils/parser.py
@@ -14,8 +14,9 @@
import pandas as pd
import xarray as xr
-from .. import properties
-from ..schemas.schemas import SchemaDict, read_schema
+from cdm_reader_mapper.mdf_reader import properties
+from cdm_reader_mapper.mdf_reader.schemas.schemas import SchemaDict, read_schema
+
from .convert_and_decode import Converters, Decoders
from .utilities import convert_dtypes
diff --git a/src/cdm_reader_mapper/mdf_reader/utils/utilities.py b/src/cdm_reader_mapper/mdf_reader/utils/utilities.py
index f34decac..c8f1c8b3 100755
--- a/src/cdm_reader_mapper/mdf_reader/utils/utilities.py
+++ b/src/cdm_reader_mapper/mdf_reader/utils/utilities.py
@@ -9,7 +9,7 @@
import pandas as pd
-from cdm_reader_mapper.common.iterators import ProcessFunction, process_function
+from cdm_reader_mapper.common import ProcessFunction, process_function
def as_list(x: str | Iterable[Any] | None) -> list[Any] | None:
@@ -451,10 +451,12 @@ def convert_str_boolean(x: Any) -> Any:
bool or original value
True if 'True', False if 'False', else original value.
"""
+ if pd.isna(x):
+ return x
if x == "True":
- x = True
+ return True
if x == "False":
- x = False
+ return False
return x
@@ -498,4 +500,5 @@ def remove_boolean_values(data: pd.DataFrame, dtypes: dict[str, str]) -> pd.Data
"""
data = data.map(_remove_boolean_values)
dtype = _adjust_dtype(dtypes, data)
+ data = data.apply(lambda series: series.str.strip() if series.dtype == "str" else series)
return data.astype(dtype)
diff --git a/src/cdm_reader_mapper/mdf_reader/utils/validators.py b/src/cdm_reader_mapper/mdf_reader/utils/validators.py
index a8f0ffdd..4c1888cc 100755
--- a/src/cdm_reader_mapper/mdf_reader/utils/validators.py
+++ b/src/cdm_reader_mapper/mdf_reader/utils/validators.py
@@ -8,8 +8,9 @@
import numpy as np
import pandas as pd
-from .. import properties
-from ..codes import codes
+from cdm_reader_mapper.mdf_reader import properties
+from cdm_reader_mapper.mdf_reader.codes import codes
+
from .utilities import convert_str_boolean
@@ -237,9 +238,9 @@ def validate(
# Explicit boolean literals ("True"/"False") override validation results
if validated_columns:
validated_columns = list(dict.fromkeys(validated_columns))
- to_bool = data[validated_columns].applymap(convert_str_boolean)
- false_mask = to_bool.applymap(_is_false)
- true_mask = to_bool.applymap(_is_true)
+ to_bool = data[validated_columns].apply(lambda col: col.map(convert_str_boolean))
+ false_mask = to_bool.apply(lambda col: col.map(_is_false))
+ true_mask = to_bool.apply(lambda col: col.map(_is_true))
mask[validated_columns] = mask[validated_columns].mask(false_mask, False)
mask[validated_columns] = mask[validated_columns].mask(true_mask, True)
diff --git a/src/cdm_reader_mapper/mdf_reader/writer.py b/src/cdm_reader_mapper/mdf_reader/writer.py
index 6193a042..a16209c2 100755
--- a/src/cdm_reader_mapper/mdf_reader/writer.py
+++ b/src/cdm_reader_mapper/mdf_reader/writer.py
@@ -9,13 +9,14 @@
import pandas as pd
-from ..common import get_filename
-from ..common.iterators import (
+from cdm_reader_mapper.common import (
ParquetStreamReader,
+ get_filename,
is_valid_iterator,
parquet_stream_from_iterable,
)
-from ..properties import SupportedFileTypes
+from cdm_reader_mapper.properties import SupportedFileTypes
+
from .utils.utilities import join, update_column_names, update_dtypes
diff --git a/src/cdm_reader_mapper/metmetpy/correct.py b/src/cdm_reader_mapper/metmetpy/correct.py
index 64bba177..a9e5f37a 100755
--- a/src/cdm_reader_mapper/metmetpy/correct.py
+++ b/src/cdm_reader_mapper/metmetpy/correct.py
@@ -62,9 +62,8 @@
import pandas as pd
-from ..common import logging_hdlr
-from ..common.iterators import ProcessFunction, process_function
-from ..common.json_dict import collect_json_files, combine_dicts
+from cdm_reader_mapper.common import ProcessFunction, collect_json_files, combine_dicts, logging_hdlr, process_function
+
from . import properties
from .datetime import correction_functions as corr_f_dt
from .platform_type import correction_functions as corr_f_pt
diff --git a/src/cdm_reader_mapper/metmetpy/datetime/correction_functions.py b/src/cdm_reader_mapper/metmetpy/datetime/correction_functions.py
index 75ccdfc6..096893bb 100755
--- a/src/cdm_reader_mapper/metmetpy/datetime/correction_functions.py
+++ b/src/cdm_reader_mapper/metmetpy/datetime/correction_functions.py
@@ -10,7 +10,8 @@
import pandas as pd
-from .. import properties
+from cdm_reader_mapper.metmetpy import properties
+
from . import model_datetimes
diff --git a/src/cdm_reader_mapper/metmetpy/datetime/model_datetimes.py b/src/cdm_reader_mapper/metmetpy/datetime/model_datetimes.py
index 98db90ac..69224682 100755
--- a/src/cdm_reader_mapper/metmetpy/datetime/model_datetimes.py
+++ b/src/cdm_reader_mapper/metmetpy/datetime/model_datetimes.py
@@ -18,7 +18,7 @@
import numpy as np
import pandas as pd
-from .. import properties
+from cdm_reader_mapper.metmetpy import properties
def datetime_decimalhour_to_hm(decimal_hours: float) -> tuple[int, int]:
diff --git a/src/cdm_reader_mapper/metmetpy/platform_type/correction_functions.py b/src/cdm_reader_mapper/metmetpy/platform_type/correction_functions.py
index 32d0bd19..1137a2ba 100755
--- a/src/cdm_reader_mapper/metmetpy/platform_type/correction_functions.py
+++ b/src/cdm_reader_mapper/metmetpy/platform_type/correction_functions.py
@@ -12,7 +12,7 @@
import pandas as pd
-from .. import properties
+from cdm_reader_mapper.metmetpy import properties
def is_num(x: Any) -> bool:
diff --git a/src/cdm_reader_mapper/metmetpy/validate.py b/src/cdm_reader_mapper/metmetpy/validate.py
index 16bc95f0..af1fc724 100755
--- a/src/cdm_reader_mapper/metmetpy/validate.py
+++ b/src/cdm_reader_mapper/metmetpy/validate.py
@@ -63,9 +63,8 @@
import pandas as pd
-from ..common import logging_hdlr
-from ..common.iterators import ProcessFunction, process_function
-from ..common.json_dict import collect_json_files, combine_dicts
+from cdm_reader_mapper.common import ProcessFunction, collect_json_files, combine_dicts, logging_hdlr, process_function
+
from . import properties
from .datetime import model_datetimes
diff --git a/tests/test_cdm_io.py b/tests/test_cdm_io.py
index 01aacea2..1050ca2f 100755
--- a/tests/test_cdm_io.py
+++ b/tests/test_cdm_io.py
@@ -202,7 +202,9 @@ def test_read_data_csv(csv_path, example_data):
data = bundle.data
assert isinstance(data, pd.DataFrame)
- pd.testing.assert_frame_equal(bundle.data, example_data.astype(str))
+ exp_data = example_data.astype(str).astype(object)
+
+ pd.testing.assert_frame_equal(bundle.data, exp_data)
def test_read_data_single_csv(csv_path, example_data):
@@ -214,7 +216,9 @@ def test_read_data_single_csv(csv_path, example_data):
data = bundle.data
assert isinstance(data, pd.DataFrame)
- pd.testing.assert_frame_equal(bundle.data, example_data["observations-sst"].astype(str))
+ exp_data = example_data["observations-sst"].astype(str).astype(object)
+
+ pd.testing.assert_frame_equal(bundle.data, exp_data)
def test_read_data_raises_data_format(csv_path):
@@ -244,7 +248,10 @@ def test_read_data_parquet(parquet_path, example_data):
data = bundle.data
assert isinstance(data, pd.DataFrame)
- pd.testing.assert_frame_equal(bundle.data, example_data)
+ exp_data = example_data.copy()
+ exp_data[("observations-sst", "B")] = exp_data[("observations-sst", "B")].astype(object)
+
+ pd.testing.assert_frame_equal(bundle.data, exp_data)
def test_read_data_feather(feather_path, example_data):
@@ -256,7 +263,10 @@ def test_read_data_feather(feather_path, example_data):
data = bundle.data
assert isinstance(data, pd.DataFrame)
- pd.testing.assert_frame_equal(bundle.data, example_data)
+ exp_data = example_data.copy()
+ exp_data[("observations-sst", "B")] = exp_data[("observations-sst", "B")].astype(object)
+
+ pd.testing.assert_frame_equal(bundle.data, exp_data)
def test_table_to_file_raises(csv_path, example_data):
@@ -307,6 +317,7 @@ def test_read_tables_testdata_str_conversion(tmp_path):
db_tmp = read_tables(tmp_path, suffix="str")
expected = read_tables(source, extension="pq", to_str=True, imodel=imodel)
+
pd.testing.assert_frame_equal(db_tmp.data["header"], expected.data)
diff --git a/tests/test_cdm_mapper.py b/tests/test_cdm_mapper.py
index 66d2cb70..33774566 100755
--- a/tests/test_cdm_mapper.py
+++ b/tests/test_cdm_mapper.py
@@ -61,7 +61,7 @@ def data_header_expected():
("header", "duplicate_status"): [4, 4, 4, 4],
("header", "platform_type"): [2, 33, 32, 45],
("header", "location_quality"): [2, 0, 0, 0],
- ("header", "source_id"): [pd.NA, pd.NA, pd.NA, pd.NA],
+ ("header", "source_id"): [None, None, None, None],
}
)
return data.astype(
@@ -493,39 +493,39 @@ def test_map_model_pub47():
]
dtypes = {
- ("header", "station_name"): str,
+ ("header", "station_name"): "object",
("header", "platform_sub_type"): "Int64",
- ("header", "primary_station_id"): str,
+ ("header", "primary_station_id"): "object",
("header", "station_record_number"): "Int64",
("header", "report_duration"): "Int64",
("observations-at", "sensor_automation_status"): "Int64",
("observations-at", "z_coordinate"): "Float64",
("observations-at", "observation_height_above_station_surface"): "Float64",
- ("observations-at", "sensor_id"): str,
+ ("observations-at", "sensor_id"): "object",
("observations-dpt", "sensor_automation_status"): "Int64",
("observations-dpt", "z_coordinate"): "Float64",
("observations-dpt", "observation_height_above_station_surface"): "Float64",
- ("observations-dpt", "sensor_id"): str,
+ ("observations-dpt", "sensor_id"): "object",
("observations-slp", "sensor_automation_status"): "Int64",
("observations-slp", "z_coordinate"): "Float64",
("observations-slp", "observation_height_above_station_surface"): "Float64",
- ("observations-slp", "sensor_id"): str,
+ ("observations-slp", "sensor_id"): "object",
("observations-sst", "sensor_automation_status"): "Int64",
("observations-sst", "z_coordinate"): "Float64",
("observations-sst", "observation_height_above_station_surface"): "Float64",
- ("observations-sst", "sensor_id"): str,
+ ("observations-sst", "sensor_id"): "object",
("observations-wbt", "sensor_automation_status"): "Int64",
("observations-wbt", "z_coordinate"): "Float64",
("observations-wbt", "observation_height_above_station_surface"): "Float64",
- ("observations-wbt", "sensor_id"): str,
+ ("observations-wbt", "sensor_id"): "object",
("observations-wd", "sensor_automation_status"): "Int64",
("observations-wd", "z_coordinate"): "Float64",
("observations-wd", "observation_height_above_station_surface"): "Float64",
- ("observations-wd", "sensor_id"): str,
+ ("observations-wd", "sensor_id"): "object",
("observations-ws", "sensor_automation_status"): "Int64",
("observations-ws", "z_coordinate"): "Float64",
("observations-ws", "observation_height_above_station_surface"): "Float64",
- ("observations-ws", "sensor_id"): str,
+ ("observations-ws", "sensor_id"): "object",
}
result = result[columns]
diff --git a/tests/test_common_select.py b/tests/test_common_select.py
index 8cf10e3a..7c633271 100755
--- a/tests/test_common_select.py
+++ b/tests/test_common_select.py
@@ -291,6 +291,8 @@ def test_split_by_boolean_psr_true(
)
exp = sample_psr.copy().read()
+ exp["B"] = exp["B"].astype(object)
+
exp_selected = exp.loc[exp_selected_idx]
exp_rejected = exp.loc[exp_rejected_idx]
@@ -369,6 +371,8 @@ def test_split_by_boolean_psr_false(
)
exp = sample_psr.copy().read()
+ exp["B"] = exp["B"].astype(object)
+
exp_selected = exp.loc[exp_selected_idx]
exp_rejected = exp.loc[exp_rejected_idx]
diff --git a/tests/test_common_utils.py b/tests/test_common_utils.py
index 1f5d5186..8d059279 100755
--- a/tests/test_common_utils.py
+++ b/tests/test_common_utils.py
@@ -8,6 +8,8 @@
from pathlib import Path
from urllib.parse import urlparse
+import numpy as np
+import pandas as pd
import pytest
import requests
@@ -28,6 +30,7 @@
open_json_file,
)
from cdm_reader_mapper.common.logging_hdlr import init_logger
+from cdm_reader_mapper.common.object_types import standardize_object_columns
def compute_md5(content: bytes) -> str:
@@ -481,3 +484,27 @@ def test_get_path_missing_file_module_not_found(tmp_path, caplog):
assert result is None
assert any("No module named" in msg for msg in caplog.messages)
+
+
+def test_standardize_object_columns():
+ df = pd.DataFrame(
+ {
+ "A": pd.Series(["x", "y", None], dtype="string"),
+ "B": pd.Series([1.1, 2.2, 3.3]),
+ "C": pd.Series([1, 2, 3]),
+ "D": pd.Series([True, False, np.nan]),
+ }
+ )
+
+ result = standardize_object_columns(df)
+
+ expected = pd.DataFrame(
+ {
+ "A": pd.Series(["x", "y", None], dtype=object),
+ "B": pd.Series([1.1, 2.2, 3.3]),
+ "C": pd.Series([1, 2, 3]),
+ "D": pd.Series([True, False, None], dtype=object),
+ }
+ )
+
+ pd.testing.assert_frame_equal(result, expected)
diff --git a/tests/test_databundle.py b/tests/test_databundle.py
index 94eab1d1..c285f34b 100755
--- a/tests/test_databundle.py
+++ b/tests/test_databundle.py
@@ -940,7 +940,7 @@ def test_correct_pt_psr():
result = db.correct_pt()
- expected = pd.DataFrame({PT: ["5", "7", "5"]})
+ expected = pd.DataFrame({PT: ["5", "7", "5"]}, dtype="object")
pd.testing.assert_frame_equal(result.data.read(), expected)
diff --git a/tests/test_mapper_conversions.py b/tests/test_mapper_conversions.py
index 2c2de3e0..d0a4cc67 100755
--- a/tests/test_mapper_conversions.py
+++ b/tests/test_mapper_conversions.py
@@ -97,7 +97,7 @@ def test_convert_array_general_to_str(exp, dtype, data):
def test_convert_str_to_str(data, exp):
series = pd.Series(data)
result = _convert_str_to_str(series, "null")
- expected = pd.Series(exp)
+ expected = pd.Series(exp, dtype=object)
pd.testing.assert_series_equal(result, expected)
@@ -164,8 +164,8 @@ def test_convert_str_array_from_str(data, exp):
def test_convert_integer_to_str(data, exp):
series = pd.Series(data)
result = _convert_integer_to_str(series, "null")
- expected_series = pd.Series(exp)
- pd.testing.assert_series_equal(result, expected_series)
+ expected = pd.Series(exp, dtype=object)
+ pd.testing.assert_series_equal(result, expected)
@pytest.mark.parametrize(
@@ -238,7 +238,7 @@ def test_convert_integer_array_from_str(data, exp):
def test_convert_float_to_str(data, decimal_places, exp):
series = pd.Series(data)
result = _convert_float_to_str(series, "null", decimal_places)
- expected = pd.Series(exp)
+ expected = pd.Series(exp, dtype=object)
pd.testing.assert_series_equal(result, expected)
@@ -278,7 +278,7 @@ def test_convert_float_from_str(data, exp):
def test_convert_float_array_to_str(data, exp):
series = pd.Series(data)
result = _convert_float_array_to_str(series, "null")
- expected = pd.Series(exp, dtype=object)
+ expected = pd.Series(exp)
pd.testing.assert_series_equal(result, expected)
diff --git a/tests/test_mapping_functions.py b/tests/test_mapping_functions.py
index f083ba7e..a914d655 100755
--- a/tests/test_mapping_functions.py
+++ b/tests/test_mapping_functions.py
@@ -311,10 +311,10 @@ def test_datetime_decimalhour_to_hm(row, expected_hr, expected_m):
(pd.DataFrame([]), pd.DatetimeIndex([])),
],
)
-def test_datetime_imma1(df, expected):
+def test_datetime_imma1_base(df, expected):
obj = MappingFunctions("dummy_model")
result = obj.datetime_imma1(df)
- pd.testing.assert_index_equal(result, expected)
+ pd.testing.assert_index_equal(result, expected.astype("datetime64[ns]"))
@pytest.mark.parametrize(
@@ -360,7 +360,7 @@ def test_datetime_imma1_to_utc(df, expected):
expected_naive = expected.tz_localize(None) if expected.tz else expected
- pd.testing.assert_index_equal(result, expected_naive)
+ pd.testing.assert_index_equal(result, expected_naive.astype("datetime64[ns]"))
@pytest.mark.parametrize(
@@ -394,7 +394,7 @@ def test_datetime_imma1_to_utc(df, expected):
def test_datetime_imma1_701(df, expected):
obj = MappingFunctions("dummy_model")
result = obj.datetime_imma1_701(df)
- pd.testing.assert_index_equal(result, expected)
+ pd.testing.assert_index_equal(result, expected.astype("datetime64[ns]"))
@pytest.mark.parametrize(
@@ -418,6 +418,7 @@ def test_datetime_imma1_701(df, expected):
def test_datetime_immt(df, expected):
obj = MappingFunctions("dummy_model")
result = obj.datetime_immt(df)
+ expected = expected.astype("datetime64[ns]")
pd.testing.assert_index_equal(result, expected)
@@ -480,6 +481,7 @@ def test_datetime_craid(df, expected):
def test_datetime_marob(df, expected):
obj = MappingFunctions("dummy_model")
result = obj.datetime_marob(df)
+ expected = expected.astype("datetime64[ns]")
pd.testing.assert_series_equal(result, expected)
@@ -756,13 +758,13 @@ def test_observing_programme(input_series, expected):
@pytest.mark.parametrize(
"input_series, prepend, append, separator, expected",
[
- (pd.Series(["a", None, "c"]), "X", "Y", "", pd.Series(["XaY", None, "XcY"])),
+ (pd.Series(["a", None, "c"], dtype=object), "X", "Y", "", pd.Series(["XaY", None, "XcY"], dtype=object)),
(
- pd.Series(["a", None, "c"]),
+ pd.Series(["a", None, "c"], dtype=object),
"pre",
"_app",
"-",
- pd.Series(["pre-a-_app", None, "pre-c-_app"]),
+ pd.Series(["pre-a-_app", None, "pre-c-_app"], dtype=object),
),
(pd.Series([]), "X", "Y", "", pd.Series([], dtype=object)),
],
@@ -770,7 +772,7 @@ def test_observing_programme(input_series, expected):
def test_string_add(input_series, prepend, append, separator, expected):
obj = MappingFunctions("dummy_model")
result = obj.string_add(input_series, prepend=prepend, append=append, separator=separator)
- pd.testing.assert_series_equal(result, expected)
+ pd.testing.assert_series_equal(result, expected.astype(object))
@pytest.mark.parametrize(
@@ -783,7 +785,7 @@ def test_string_add(input_series, prepend, append, separator, expected):
"-",
None,
None,
- pd.Series(["X-1-3-Y", "X-2-4-Y"]),
+ pd.Series(["X-1-3-Y", "X-2-4-Y"], dtype=object),
),
(
pd.DataFrame({"A": [1, 2], "B": [3, 4]}),
@@ -792,7 +794,7 @@ def test_string_add(input_series, prepend, append, separator, expected):
"-",
[0, 1],
[2, 2],
- pd.Series(["X-01-03-Y", "X-02-04-Y"]),
+ pd.Series(["X-01-03-Y", "X-02-04-Y"], dtype=object),
),
(
pd.DataFrame({"A": [1, 2], "B": [3, 4]}),
@@ -801,7 +803,7 @@ def test_string_add(input_series, prepend, append, separator, expected):
"-",
None,
None,
- pd.Series(["1-3", "2-4"]),
+ pd.Series(["1-3", "2-4"], dtype=object),
),
(
pd.DataFrame(columns=["A", "B"]),
@@ -819,7 +821,7 @@ def test_string_add(input_series, prepend, append, separator, expected):
":",
[0],
[3],
- pd.Series(["P:005:Q", "P:006:Q"]),
+ pd.Series(["P:005:Q", "P:006:Q"], dtype=object),
),
],
)
@@ -954,7 +956,7 @@ def test_feet_to_m(input_series, expected):
pd.DataFrame({"AAAA": [12], "MM": [3], "YY": [7], "GG": [5]}),
"",
"",
- pd.Series(["a57ea24d0eb65ca390a63bd175c906db"], dtype="object"),
+ pd.Series(["a57ea24d0eb65ca390a63bd175c906db"], dtype=object),
),
(
pd.DataFrame({"AAAA": [1, 2024], "MM": [1, 12], "YY": [1, 99], "GG": [1, 23]}),
@@ -965,20 +967,20 @@ def test_feet_to_m(input_series, expected):
"de3d414a8823554bbfde50f2305958d0",
"5f4b0ac6560552bf9e69cc9de0541bd6",
],
- dtype="object",
+ dtype=object,
),
),
(
pd.DataFrame({"AAAA": [50], "MM": [6], "YY": [24], "GG": [4]}),
"PRE-",
"-POST",
- pd.Series(["PRE-1d37cb121ceb546daba6431da61cd309-POST"], dtype="object"),
+ pd.Series(["PRE-1d37cb121ceb546daba6431da61cd309-POST"], dtype=object),
),
(
pd.DataFrame({"AAAA": [], "MM": [], "YY": [], "GG": []}),
"",
"",
- pd.Series([], dtype="object"),
+ pd.Series([], dtype=object),
),
],
)
diff --git a/tests/test_mdf_reader.py b/tests/test_mdf_reader.py
index 3ac303af..d1498854 100755
--- a/tests/test_mdf_reader.py
+++ b/tests/test_mdf_reader.py
@@ -457,13 +457,15 @@ def test_validate_read_mdf_args_invalid_years(tmp_path):
@pytest.fixture
def example_data():
- return pd.DataFrame(
+ df = pd.DataFrame(
{
"A": [1, 2, 3],
"B": [4.0, 5.0, 6.0],
"C": ["x", "y", "z"],
- }
+ },
)
+ df = df.astype({"A": "int", "B": "float", "C": "object"})
+ return df
@pytest.fixture
diff --git a/tests/test_metmetpy.py b/tests/test_metmetpy.py
index 746a1249..85f2a0f2 100755
--- a/tests/test_metmetpy.py
+++ b/tests/test_metmetpy.py
@@ -89,7 +89,7 @@ def test_icoads_to_datetime_basis():
]
)
)
- pd.testing.assert_series_equal(result, expected)
+ pd.testing.assert_series_equal(result, expected.astype("datetime64[ns]"))
def test_icoads_to_datetime_missing_values():
@@ -113,7 +113,7 @@ def test_icoads_to_datetime_missing_values():
]
)
)
- pd.testing.assert_series_equal(result, expected)
+ pd.testing.assert_series_equal(result, expected.astype("datetime64[ns]"))
def test_icoads_from_datetime_basis():
@@ -202,7 +202,9 @@ def test_icoads_to_datetime_missing_columns():
result = icoads(df, "to_datetime")
- pd.testing.assert_series_equal(result, pd.Series(pd.to_datetime([None])))
+ expected = pd.Series(pd.to_datetime([None]))
+
+ pd.testing.assert_series_equal(result, expected.astype("datetime64[ns]"))
def test_icoads_from_datetime_empty_series():
@@ -228,7 +230,7 @@ def test_to_datetime_basis():
expected = pd.Series(pd.to_datetime(["2000-01-10 12:30:00", "2001-02-15 06:15:00"]))
- pd.testing.assert_series_equal(result, expected)
+ pd.testing.assert_series_equal(result, expected.astype("datetime64[ns]"))
def test_to_datetime_no_correction():
@@ -894,7 +896,7 @@ def test_correct_pt_valid_iterable():
df2 = pd.DataFrame({PT: ["6", "7", None]}, index=[3, 4, 5])
result = correct_pt(ParquetStreamReader(iter([df1, df2])), "icoads_r300_d993")
- exp = pd.DataFrame({PT: ["5", "7", "5", "6", "7", "5"]})
+ exp = pd.DataFrame({PT: ["5", "7", "5", "6", "7", "5"]}, dtype="object")
pd.testing.assert_frame_equal(result.read(), exp)
diff --git a/tests/test_reader_convert_and_decode.py b/tests/test_reader_convert_and_decode.py
index adbc3eb9..3a22b9a1 100755
--- a/tests/test_reader_convert_and_decode.py
+++ b/tests/test_reader_convert_and_decode.py
@@ -139,8 +139,7 @@ def test_object_to_object_strip():
series = pd.Series([" a ", "", " ", "b"])
result = conv.object_to_object(series)
-
- assert result.tolist() == ["a", None, None, "b"]
+ assert result.tolist() == ["a", pd.NA, pd.NA, "b"]
def test_object_to_object_disable_lstrip():
diff --git a/tests/test_reader_utilities.py b/tests/test_reader_utilities.py
index 31bdda23..e59f6ecf 100755
--- a/tests/test_reader_utilities.py
+++ b/tests/test_reader_utilities.py
@@ -231,12 +231,13 @@ def test_adjust_dtype():
assert _adjust_dtype("str", df) == "str"
-def test_remove_boolean_values():
+def test_remove_boolean_values_sequence():
df = pd.DataFrame({"A": ["True", "False", "hello"], "B": [1, 2, 3]})
dtypes = {"A": "object", "B": "int"}
result = remove_boolean_values(df, dtypes)
- assert result.loc[0, "A"] is None
- assert result.loc[1, "A"] is None
+ # Check for "missing" rather than identity with the np.nan singleton, which breaks on any other NaN object.
+ assert pd.isna(result.loc[0, "A"])
+ assert pd.isna(result.loc[1, "A"])
assert result.loc[2, "A"] == "hello"
assert result["B"].dtype.name == "int64"
diff --git a/tests/test_writers.py b/tests/test_writers.py
index 822369cd..6cfc6bbe 100755
--- a/tests/test_writers.py
+++ b/tests/test_writers.py
@@ -31,7 +31,6 @@ def db_data():
pattern = f"test_{imodel}"
data_file = test_data[pattern]["mdf_data"]
-
db = read(data_file, mode="data")
db.imodel = imodel
return db
@@ -40,13 +39,13 @@ def db_data():
def test_write_data_csv(tmp_path, db_data):
db_data.write(out_dir=tmp_path, data_format="csv")
tmppath = Path(tmp_path)
- db_res = read(
+ db_exp = read(
tmppath / "data.csv",
info_file=tmppath / "info.json",
data_format="csv",
mode="data",
)
- pd.testing.assert_frame_equal(db_data.data, db_res.data)
+ pd.testing.assert_frame_equal(db_data.data, db_exp.data)
def test_write_tables_csv(tmp_path, db_tables):