Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
9 changes: 6 additions & 3 deletions CHANGELOG.rst
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@ Contributor to this version: Ludwig Lierhammer (:user:`ludwiglierhammer`)
Announcements
^^^^^^^^^^^^^
* `cdm_reader_mapper` now drops support for Python 3.10 (:pull:`419`)
* `cdm_reader_mapper` now uses `cruft <https://cruft.github.io/cruft/>` and the `Ouranosinc cookiecutter template <https://github.com/Ouranosinc/cookiecutter-pypackage>` (:issue:`369`, :pull:`419`)
* `cdm_reader_mapper` now uses `cruft <https://cruft.github.io/cruft/>`_ and the `Ouranosinc cookiecutter template <https://github.com/Ouranosinc/cookiecutter-pypackage>`_ (:issue:`369`, :pull:`419`)

New features and enhancements
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
Expand All @@ -20,7 +20,7 @@ Breaking changes
^^^^^^^^^^^^^^^^
* Development dependencies ("dev", "docs") are now installed via the new `dependency-groups` conventions (`PEP 735 <https://peps.python.org/pep-0735/>`_) (:pull:`419`)
* `prek` is now the suggested pre-commit runner (installed by default via `pip install --group dev`) (:pull:`419`)
* delete submodule ``src.cdm_reader_mapper.duplicates`` (:issue:`152`, :issue:`283`, :pull:`434`)
* submodule ``src.cdm_reader_mapper.duplicates`` has been deleted and moved to `marine_qc <https://github.com/glamod/marine_qc/pull/207/>`_ (:issue:`283`, :pull:`434`)

* ``cdm_reader_mapper.DupDetect`` is not importable anymore
* ``cdm_reader_mapper.duplicate_check`` is not importable anymore
Expand All @@ -30,7 +30,8 @@ Breaking changes
* ``cdm_reader_mapper.DataBundle.remove_duplicates`` is not callable anymore
* ``cdm_reader_mapper.DataBundle`` does not have attribute ``DupDetect`` anymore

* submodule ``src.cdm_reader_mapper.duplicates`` has been moved to `marine_qc <https://github.com/glamod/marine_qc/pull/207/>`_ (:issue:`283`, :pull:`434`)
* the code has been updated to support pandas >= 3.0.0 (:pull:`435`)
* reader functions now return columns with ``object`` dtype instead of string dtype (:pull:`435`)


Internal changes
Expand All @@ -53,6 +54,8 @@ Internal changes
* remove helper class `core._utilities._DataBundle` and integrate it into `core.databundle.DataBundle` (:pull:`419`)
* make use of `pathlib.Path` instead of `os.path` (:pull:`419`)
* consistently use the parameter names "imodel" instead of "data_model" and "correction_method" instead of "fix_method" in `metmetpy` modules (:pull:`419`)
* make some JSON and iterator utility functions directly importable in submodule ``cdm_reader_mapper.common`` (:pull:`435`)
* replace relative imports with absolute imports for any submodules located one or more directories above the current submodule (:pull:`435`)


2.4.1 (2016-04-16)
Expand Down
2 changes: 1 addition & 1 deletion environment-docs.yml
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@ dependencies:
- h5netcdf >=1.6.1
- h5py >=3.13.0
- numpy >=2.1.3
- pandas >=2.2.0
- pandas >=3.0.0
- pyarrow >=15.0.0 # Strongly encouraged for Pandas v2.2.0+
- timezonefinder >6.5.0
- xarray >=2023.11.0,!=2024.10.0
Expand Down
2 changes: 1 addition & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -93,7 +93,7 @@ dependencies = [
"h5netcdf >=1.6.1",
"h5py >= 3.13.0",
"numpy>=2.1.3",
"pandas>=2.2.0",
"pandas>=3.0.0",
"platformdirs >4.0.0",
"pyarrow >=15.0.0",
"requests",
Expand Down
5 changes: 2 additions & 3 deletions src/cdm_reader_mapper/cdm_mapper/codes/codes.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,14 +13,13 @@
from pathlib import Path
from typing import Any

from cdm_reader_mapper.common.json_dict import (
from cdm_reader_mapper.cdm_mapper import properties
from cdm_reader_mapper.common import (
collect_json_files,
combine_dicts,
open_json_file,
)

from .. import properties


def _eval(s: str) -> Any:
"""
Expand Down
11 changes: 6 additions & 5 deletions src/cdm_reader_mapper/cdm_mapper/mapper.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,11 +18,12 @@

import pandas as pd

from cdm_reader_mapper.common import logging_hdlr
from cdm_reader_mapper.common.iterators import (
from cdm_reader_mapper.common import (
ParquetStreamReader,
ProcessFunction,
logging_hdlr,
process_function,
standardize_object_columns,
)

from . import properties
Expand Down Expand Up @@ -65,14 +66,14 @@ def _drop_duplicated_rows(df: pd.DataFrame) -> pd.DataFrame:
Parameters
----------
df : pd.DataFrame
Input DataFrame to drop duplictaed rows.
Input DataFrame to drop duplicated rows.

Returns
-------
pd.DataFrame
Input DataFrame with deleted duplicated rows.
"""
list_cols = [col for col in df.columns if df[col].apply(lambda x: isinstance(x, list)).any()]
list_cols = [col for col in df.columns if df[col].dtype == "object" and df[col].apply(lambda x: isinstance(x, list)).any()]

for col in list_cols:
df[col] = df[col].apply(lambda x: tuple(x) if isinstance(x, list) else x)
Expand Down Expand Up @@ -481,7 +482,7 @@ def _table_mapping(
if drop_duplicates:
table_df = _drop_duplicated_rows(table_df)

return table_df
return standardize_object_columns(table_df)


def _prepare_cdm_tables(cdm_subset: str | Sequence[str]) -> dict[str, Any]:
Expand Down
4 changes: 2 additions & 2 deletions src/cdm_reader_mapper/cdm_mapper/properties.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@

from __future__ import annotations

from ..properties import SupportedDataModels, SupportedFileTypes
from cdm_reader_mapper.properties import SupportedDataModels, SupportedFileTypes


__all__ = [
Expand All @@ -27,7 +27,7 @@
"observations-slp",
]

# ...from CDM table definitions psuedo-sql(...) --------------------------------
# ...from CDM table definitions pseudo-sql(...) --------------------------------
pandas_dtypes: dict[str, dict[str, str]] = {}
pandas_dtypes["from_sql"] = {}
pandas_dtypes["from_sql"]["timestamp with timezone"] = "object"
Expand Down
9 changes: 5 additions & 4 deletions src/cdm_reader_mapper/cdm_mapper/reader.py
Original file line number Diff line number Diff line change
Expand Up @@ -51,11 +51,10 @@

import pandas as pd

from cdm_reader_mapper.common import get_filename, logging_hdlr
from cdm_reader_mapper.common import get_filename, logging_hdlr, standardize_object_columns
from cdm_reader_mapper.core.databundle import DataBundle

from ..properties import SupportedFileTypes
from .properties import cdm_tables
from .properties import SupportedFileTypes, cdm_tables
from .utils.conversions import convert_from_str_df, convert_to_str_df
from .utils.utilities import get_cdm_subset, get_usecols

Expand Down Expand Up @@ -439,4 +438,6 @@ def read_tables(
elif to_str is True:
merged = convert_to_str_df(merged, imodel, cdm_subset=cdm_subset)

return DataBundle(data=merged, columns=merged.columns, mode="tables")
data = standardize_object_columns(merged)

return DataBundle(data=data, columns=merged.columns, mode="tables")
5 changes: 2 additions & 3 deletions src/cdm_reader_mapper/cdm_mapper/tables/tables.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,14 +15,13 @@
from pathlib import Path
from typing import Any

from cdm_reader_mapper.common.json_dict import (
from cdm_reader_mapper.cdm_mapper import properties
from cdm_reader_mapper.common import (
collect_json_files,
combine_dicts,
open_json_file,
)

from .. import properties


def get_cdm_atts(
cdm_tables: str | tuple[str, str] | Sequence[str | tuple[str, str]] | None = None,
Expand Down
16 changes: 10 additions & 6 deletions src/cdm_reader_mapper/cdm_mapper/utils/conversions.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,8 +8,8 @@
import numpy as np
import pandas as pd

from .. import properties
from ..tables.tables import get_cdm_atts, get_imodel_maps
from cdm_reader_mapper.cdm_mapper import properties
from cdm_reader_mapper.cdm_mapper.tables.tables import get_cdm_atts, get_imodel_maps


class BaseConverter:
Expand Down Expand Up @@ -269,7 +269,7 @@ def _convert_value(x: Any) -> str:

return "{" + ",".join(str_list) + "}"

return data.apply(_convert_value).astype(object)
return data.apply(_convert_value)


def _convert_str_to_str(data: pd.Series, null_label: str) -> pd.Series:
Expand Down Expand Up @@ -311,7 +311,7 @@ def _return_str(x: Any, null_label: str) -> str:
return null_label
return str(x)

return data.apply(lambda x: _return_str(x, null_label))
return data.apply(lambda x: _return_str(x, null_label)).astype(object)


def _convert_str_from_str(data: pd.Series, null_label: str) -> pd.Series:
Expand Down Expand Up @@ -372,7 +372,7 @@ def _convert_str_array_to_str(data: pd.Series, null_label: str) -> pd.Series:
pd.Series
Series with integer arrays in "{...}" format.
"""
return _convert_array_general_to_str(data, null_label, dtype=str)
return _convert_array_general_to_str(data, null_label, dtype=object)


def _convert_str_array_from_str(data: pd.Series, null_label: str) -> pd.Series:
Expand Down Expand Up @@ -662,7 +662,10 @@ def _convert_datetime_from_str(data: pd.Series, null_label: str) -> pd.Series:
Series with values converted to pandas datetime dtype (datetime64[ns]).
Invalid or non-convertible values are set to NaT.
"""
return pd.to_datetime(data, errors="coerce")
dt = pd.to_datetime(data, errors="coerce")
if dt.dt.tz is not None:
dt = dt.dt.tz_localize(None)
return dt.astype("datetime64[ns]")


def _convert_column(
Expand Down Expand Up @@ -832,6 +835,7 @@ def _convert_columns(
null_label,
converters,
)

return data


Expand Down
28 changes: 13 additions & 15 deletions src/cdm_reader_mapper/cdm_mapper/utils/mapping_functions.py
Original file line number Diff line number Diff line change
Expand Up @@ -369,48 +369,47 @@ def datetime_imma1(self, df: pd.DataFrame) -> pd.DatetimeIndex:
DatetimeIndex of converted timestamps.
"""
if df.empty:
return pd.DatetimeIndex([])
return pd.DatetimeIndex([], dtype="datetime64[ns]")

df = df.iloc[:, 0:4]
date_format = "%Y-%m-%d-%H-%M"
hr_ = df.columns[-1]
df = df.assign(HR=df.iloc[:, -1])
df["M"] = df["HR"].copy()
df = df.drop(columns=hr_, axis=1)

df = df.drop(columns=hr_)
hr_min = df.apply(lambda x: self.datetime_decimalhour_to_hm(x), axis=1)
df["HR"] = hr_min["HR"]
df["M"] = hr_min["M"]
df = df.apply(lambda col: col.map(to_int))

strings = df.astype(str).apply("-".join, axis=1).values
strings = df.apply(lambda row: "-".join(map(str, row)), axis=1).values
result = pd.to_datetime(
strings,
format=date_format,
errors="coerce",
)
result.index = df.index
return result
return result.astype("datetime64[ns]")

def datetime_imma1_to_utc(self, df: pd.DataFrame) -> pd.DatetimeIndex:
"""
Convert to pandas datetime object for IMMA1 deck 701 format.
Convert to pandas datetime object to UTC time.

Set missing hour to 12 and use latitude and longitude information
to convert local midday to UTC time.

Parameters
----------
df : pd.DataFrame
IMMA1 deck 701 dataset containing year, month, day, latitude, and longitude.
IMMA1 dataset containing year, month, day, latitude, and longitude.

Returns
-------
pd.DatetimeIndex
DatetimeIndex with timestamps converted to UTC.
"""
if df.empty:
return pd.DatetimeIndex([])
return pd.DatetimeIndex([], dtype="datetime64[ns]")

date_format = "%Y-%m-%d-%H-%M"

Expand All @@ -437,7 +436,7 @@ def datetime_imma1_to_utc(self, df: pd.DataFrame) -> pd.DatetimeIndex:

results = df_time.apply(lambda x: convert_to_utc_i(x["Dates"], x["Time_zone"]), axis=1)
results.index = df.index
return pd.DatetimeIndex(results.dt.tz_convert(None))
return pd.DatetimeIndex(results.dt.tz_convert(None), dtype="datetime64[ns]")

def datetime_imma1_701(self, df: pd.DataFrame) -> pd.DatetimeIndex:
"""
Expand All @@ -454,7 +453,7 @@ def datetime_imma1_701(self, df: pd.DataFrame) -> pd.DatetimeIndex:
DatetimeIndex with converted timestamps.
"""
if df.empty:
return pd.DatetimeIndex([])
return pd.DatetimeIndex([], dtype="datetime64[ns]")

hr = df.iloc[:, 3]
valid_mask = hr.notna()
Expand Down Expand Up @@ -484,7 +483,7 @@ def datetime_immt(self, df: pd.DataFrame) -> pd.DatetimeIndex:
DatetimeIndex of converted timestamps.
"""
if df.empty:
return pd.DatetimeIndex([])
return pd.DatetimeIndex([], dtype="datetime64[ns]")

date_format = "%Y-%m-%d-%H-%M"
df = df.copy()
Expand All @@ -496,7 +495,7 @@ def datetime_immt(self, df: pd.DataFrame) -> pd.DatetimeIndex:
format=date_format,
errors="coerce",
)
return pd.DatetimeIndex(result)
return pd.DatetimeIndex(result).astype("datetime64[ns]")

def datetime_utcnow(self, df: pd.DataFrame) -> datetime.datetime:
"""
Expand Down Expand Up @@ -566,7 +565,7 @@ def datetime_marob(self, series: pd.Series, format: str = "%Y-%m-%dT%H:%M:%S") -
pd.Series
Series of converted dates.
"""
return series_strptime(series, format)
return series_strptime(series, format).astype("datetime64[ns]")

def df_col_join(self, df: pd.DataFrame, sep: str) -> pd.Series:
"""
Expand Down Expand Up @@ -819,7 +818,6 @@ def string_add(
Series with modified string values.
"""
result = np.vectorize(string_add_i, otypes="O")(prepend, series, append, separator)

return pd.Series(result, index=series.index, dtype="object")

def string_join_add(
Expand Down Expand Up @@ -1016,7 +1014,7 @@ def gdac_uid(self, df: pd.DataFrame, prepend: str = "", append: str = "") -> pd.
for i, n in enumerate(name):
uid[i] = str(prepend) + uuid.uuid5(uuid.NAMESPACE_OID, str(n)).hex + str(append)
df["UUID"] = uid
return df["UUID"]
return df["UUID"].astype(object)

def gdac_latitude(self, df: pd.DataFrame) -> pd.Series:
"""
Expand Down
2 changes: 1 addition & 1 deletion src/cdm_reader_mapper/cdm_mapper/utils/utilities.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@
from collections.abc import Iterable
from typing import Any

from .. import properties
from cdm_reader_mapper.cdm_mapper import properties


def dict_to_tuple_list(dic: dict[Any, Any]) -> list[tuple[Any, Any]]:
Expand Down
2 changes: 1 addition & 1 deletion src/cdm_reader_mapper/cdm_mapper/writer.py
Original file line number Diff line number Diff line change
Expand Up @@ -29,7 +29,7 @@

from cdm_reader_mapper.common import get_filename, logging_hdlr

from ..properties import SupportedFileTypes
from .properties import SupportedFileTypes
from .tables.tables import get_cdm_atts
from .utils.conversions import convert_from_str_df, convert_to_str_df
from .utils.utilities import adjust_filename, dict_to_tuple_list, get_cdm_subset
Expand Down
Loading
Loading