From 8a747571e2f83b2e67447dda6d99ced8f219a073 Mon Sep 17 00:00:00 2001 From: Ludwig Lierhammer Date: Fri, 29 May 2026 17:46:27 +0200 Subject: [PATCH 01/13] string goes object --- src/cdm_reader_mapper/cdm_mapper/mapper.py | 9 +++-- src/cdm_reader_mapper/cdm_mapper/reader.py | 5 +++ .../cdm_mapper/utils/conversions.py | 12 ++++--- .../cdm_mapper/utils/mapping_functions.py | 28 +++++++-------- src/cdm_reader_mapper/common/iterators.py | 5 +++ .../duplicates/duplicates.py | 13 ++++--- src/cdm_reader_mapper/mdf_reader/reader.py | 8 +++++ .../mdf_reader/schemas/schemas.py | 2 -- .../mdf_reader/utils/convert_and_decode.py | 8 ++++- .../mdf_reader/utils/filereader.py | 5 +++ .../mdf_reader/utils/utilities.py | 7 ++-- .../mdf_reader/utils/validators.py | 6 ++-- tests/test_cdm_io.py | 19 ++++++++--- tests/test_cdm_mapper.py | 20 +++++------ tests/test_common_select.py | 4 +++ tests/test_databundle.py | 4 +-- tests/test_mapper_conversions.py | 10 +++--- tests/test_mapping_functions.py | 34 ++++++++++--------- tests/test_mdf_reader.py | 8 +++-- tests/test_metmetpy.py | 12 ++++--- tests/test_reader_convert_and_decode.py | 3 +- tests/test_reader_utilities.py | 8 +++-- tests/test_writers.py | 5 ++- 23 files changed, 150 insertions(+), 85 deletions(-) diff --git a/src/cdm_reader_mapper/cdm_mapper/mapper.py b/src/cdm_reader_mapper/cdm_mapper/mapper.py index 712823db..f8b4bdd9 100755 --- a/src/cdm_reader_mapper/cdm_mapper/mapper.py +++ b/src/cdm_reader_mapper/cdm_mapper/mapper.py @@ -65,14 +65,14 @@ def _drop_duplicated_rows(df: pd.DataFrame) -> pd.DataFrame: Parameters ---------- df : pd.DataFrame - Input DataFrame to drop duplictaed rows. + Input DataFrame to drop duplicated rows. Returns ------- pd.DataFrame Input DataFrame with deleted duplicated rows. """ - list_cols = [col for col in df.columns if df[col].apply(lambda x: isinstance(x, list)).any()] + list_cols = [col for col in df.columns if df[col].dtype == "object" and df[col].apply(lambda x: isinstance(x, list)).any()] for col in list_cols: df[col] = df[col].apply(lambda x: tuple(x) if isinstance(x, list) else x) @@ -481,6 +481,11 @@ def _table_mapping( if drop_duplicates: table_df = _drop_duplicated_rows(table_df) + string_cols = table_df.select_dtypes(include="string").columns + table_df[string_cols] = table_df[string_cols].astype(object) + object_cols = table_df.select_dtypes(include="object").columns + table_df[object_cols] = table_df[object_cols].fillna(None) + return table_df diff --git a/src/cdm_reader_mapper/cdm_mapper/reader.py b/src/cdm_reader_mapper/cdm_mapper/reader.py index 8d5b0837..b8f2c428 100755 --- a/src/cdm_reader_mapper/cdm_mapper/reader.py +++ b/src/cdm_reader_mapper/cdm_mapper/reader.py @@ -439,4 +439,9 @@ def read_tables( elif to_str is True: merged = convert_to_str_df(merged, imodel, cdm_subset=cdm_subset) + string_cols = merged.select_dtypes(include="string").columns + merged[string_cols] = merged[string_cols].astype(object) + object_cols = merged.select_dtypes(include="object").columns + merged[object_cols] = merged[object_cols].fillna(None) + return DataBundle(data=merged, columns=merged.columns, mode="tables") diff --git a/src/cdm_reader_mapper/cdm_mapper/utils/conversions.py b/src/cdm_reader_mapper/cdm_mapper/utils/conversions.py index c919d364..47cf6eb3 100755 --- a/src/cdm_reader_mapper/cdm_mapper/utils/conversions.py +++ b/src/cdm_reader_mapper/cdm_mapper/utils/conversions.py @@ -269,7 +269,7 @@ def _convert_value(x: Any) -> str: return "{" + ",".join(str_list) + "}" - return data.apply(_convert_value).astype(object) + return data.apply(_convert_value) def _convert_str_to_str(data: pd.Series, null_label: str) -> pd.Series: @@ -311,7 +311,7 @@ def _return_str(x: Any, null_label: str) -> str: return null_label return str(x) - return data.apply(lambda x: _return_str(x, null_label)) + return data.apply(lambda x: _return_str(x, null_label)).astype(object) def _convert_str_from_str(data: pd.Series, null_label: str) -> pd.Series: @@ -372,7 +372,7 @@ def _convert_str_array_to_str(data: pd.Series, null_label: str) -> pd.Series: pd.Series Series with integer arrays in "{...}" format. """ - return _convert_array_general_to_str(data, null_label, dtype=str) + return _convert_array_general_to_str(data, null_label, dtype=object) def _convert_str_array_from_str(data: pd.Series, null_label: str) -> pd.Series: @@ -662,7 +662,10 @@ def _convert_datetime_from_str(data: pd.Series, null_label: str) -> pd.Series: Series with values converted to pandas datetime dtype (datetime64[ns]). Invalid or non-convertible values are set to NaT. """ - return pd.to_datetime(data, errors="coerce") + dt = pd.to_datetime(data, errors="coerce") + if dt.dt.tz is not None: + dt = dt.dt.tz_localize(None) + return dt.astype("datetime64[ns]") def _convert_column( @@ -832,6 +835,7 @@ def _convert_columns( null_label, converters, ) + return data diff --git a/src/cdm_reader_mapper/cdm_mapper/utils/mapping_functions.py b/src/cdm_reader_mapper/cdm_mapper/utils/mapping_functions.py index ef5abcd6..fe74b193 100755 --- a/src/cdm_reader_mapper/cdm_mapper/utils/mapping_functions.py +++ b/src/cdm_reader_mapper/cdm_mapper/utils/mapping_functions.py @@ -369,32 +369,31 @@ def datetime_imma1(self, df: pd.DataFrame) -> pd.DatetimeIndex: DatetimeIndex of converted timestamps. """ if df.empty: - return pd.DatetimeIndex([]) + return pd.DatetimeIndex([], dtype="datetime64[ns]") df = df.iloc[:, 0:4] date_format = "%Y-%m-%d-%H-%M" hr_ = df.columns[-1] df = df.assign(HR=df.iloc[:, -1]) df["M"] = df["HR"].copy() - df = df.drop(columns=hr_, axis=1) - + df = df.drop(columns=hr_) hr_min = df.apply(lambda x: self.datetime_decimalhour_to_hm(x), axis=1) df["HR"] = hr_min["HR"] df["M"] = hr_min["M"] df = df.apply(lambda col: col.map(to_int)) - strings = df.astype(str).apply("-".join, axis=1).values + strings = df.apply(lambda row: "-".join(map(str, row)), axis=1).values result = pd.to_datetime( strings, format=date_format, errors="coerce", ) result.index = df.index - return result + return result.astype("datetime64[ns]") def datetime_imma1_to_utc(self, df: pd.DataFrame) -> pd.DatetimeIndex: """ - Convert to pandas datetime object for IMMA1 deck 701 format. + Convert to pandas datetime object to UTC time. Set missing hour to 12 and use latitude and longitude information to convert local midday to UTC time. @@ -402,7 +401,7 @@ def datetime_imma1_to_utc(self, df: pd.DataFrame) -> pd.DatetimeIndex: Parameters ---------- df : pd.DataFrame - IMMA1 deck 701 dataset containing year, month, day, latitude, and longitude. + IMMA1 dataset containing year, month, day, latitude, and longitude. Returns ------- @@ -410,7 +409,7 @@ def datetime_imma1_to_utc(self, df: pd.DataFrame) -> pd.DatetimeIndex: DatetimeIndex with timestamps converted to UTC. """ if df.empty: - return pd.DatetimeIndex([]) + return pd.DatetimeIndex([], dtype="datetime64[ns]") date_format = "%Y-%m-%d-%H-%M" @@ -437,7 +436,7 @@ def datetime_imma1_to_utc(self, df: pd.DataFrame) -> pd.DatetimeIndex: results = df_time.apply(lambda x: convert_to_utc_i(x["Dates"], x["Time_zone"]), axis=1) results.index = df.index - return pd.DatetimeIndex(results.dt.tz_convert(None)) + return pd.DatetimeIndex(results.dt.tz_convert(None), dtype="datetime64[ns]") def datetime_imma1_701(self, df: pd.DataFrame) -> pd.DatetimeIndex: """ @@ -454,7 +453,7 @@ def datetime_imma1_701(self, df: pd.DataFrame) -> pd.DatetimeIndex: DatetimeIndex with converted timestamps. """ if df.empty: - return pd.DatetimeIndex([]) + return pd.DatetimeIndex([], dtype="datetime64[ns]") hr = df.iloc[:, 3] valid_mask = hr.notna() @@ -484,7 +483,7 @@ def datetime_immt(self, df: pd.DataFrame) -> pd.DatetimeIndex: DatetimeIndex of converted timestamps. """ if df.empty: - return pd.DatetimeIndex([]) + return pd.DatetimeIndex([], dtype="datetime64[ns]") date_format = "%Y-%m-%d-%H-%M" df = df.copy() @@ -496,7 +495,7 @@ def datetime_immt(self, df: pd.DataFrame) -> pd.DatetimeIndex: format=date_format, errors="coerce", ) - return pd.DatetimeIndex(result) + return pd.DatetimeIndex(result).astype("datetime64[ns]") def datetime_utcnow(self, df: pd.DataFrame) -> datetime.datetime: """ @@ -566,7 +565,7 @@ def datetime_marob(self, series: pd.Series, format: str = "%Y-%m-%dT%H:%M:%S") - pd.Series Series of converted dates. """ - return series_strptime(series, format) + return series_strptime(series, format).astype("datetime64[ns]") def df_col_join(self, df: pd.DataFrame, sep: str) -> pd.Series: """ @@ -819,7 +818,6 @@ def string_add( Series with modified string values. """ result = np.vectorize(string_add_i, otypes="O")(prepend, series, append, separator) - return pd.Series(result, index=series.index, dtype="object") def string_join_add( @@ -1016,7 +1014,7 @@ def gdac_uid(self, df: pd.DataFrame, prepend: str = "", append: str = "") -> pd. for i, n in enumerate(name): uid[i] = str(prepend) + uuid.uuid5(uuid.NAMESPACE_OID, str(n)).hex + str(append) df["UUID"] = uid - return df["UUID"] + return df["UUID"].astype(object) def gdac_latitude(self, df: pd.DataFrame) -> pd.Series: """ diff --git a/src/cdm_reader_mapper/common/iterators.py b/src/cdm_reader_mapper/common/iterators.py index de1bb350..753d183c 100755 --- a/src/cdm_reader_mapper/common/iterators.py +++ b/src/cdm_reader_mapper/common/iterators.py @@ -558,6 +558,11 @@ def _parquet_generator(temp_dir: TemporaryDirectory[str], data_type: type, schem for f in files: df = pd.read_parquet(f) + string_cols = df.select_dtypes(include="str").columns + df[string_cols] = df[string_cols].astype(object) + object_cols = df.select_dtypes(include="object").columns + df[object_cols] = df[object_cols].fillna(None) + if data_type is pd.Series: s = df.iloc[:, 0].copy() s.name = schema diff --git a/src/cdm_reader_mapper/duplicates/duplicates.py b/src/cdm_reader_mapper/duplicates/duplicates.py index 8722c721..211da533 100755 --- a/src/cdm_reader_mapper/duplicates/duplicates.py +++ b/src/cdm_reader_mapper/duplicates/duplicates.py @@ -50,10 +50,11 @@ def convert_date_to_float(date: pd.Series | pd.DatetimeIndex) -> pd.Series: df = df.copy() for column, method in conversion.items(): - try: - df[column] = df[column].astype(method) - except TypeError: + if method in locals(): df[column] = locals()[method](df[column]) + else: + df[column] = df[column].fillna(np.nan) + df[column] = df[column].astype(method) df = df.infer_objects(copy=False).fillna(9999.0) return df @@ -509,8 +510,12 @@ def replace_keeps_and_drops(df: pd.DataFrame, keep: Any) -> pd.DataFrame: result = add_history(result, indexes) result = result.sort_index(ascending=True) result = add_duplicates(result, duplicates) + result = result.astype(dtypes) + + object_cols = result.select_dtypes(include="object").columns + result[object_cols] = result[object_cols].fillna(None) - self.result = result.astype(dtypes) + self.result = result self.data = self.data.sort_index(ascending=True) return self.result diff --git a/src/cdm_reader_mapper/mdf_reader/reader.py b/src/cdm_reader_mapper/mdf_reader/reader.py index 0c85d4c3..e1116450 100755 --- a/src/cdm_reader_mapper/mdf_reader/reader.py +++ b/src/cdm_reader_mapper/mdf_reader/reader.py @@ -302,6 +302,14 @@ def _read_data( **mask_kwargs, ) + string_cols = data.select_dtypes(include="str").columns + data[string_cols] = data[string_cols].astype(object) + object_cols = data.select_dtypes(include="object").columns + data[object_cols] = data[object_cols].fillna(None) + + if "dtypes" in info: + info["dtypes"] = info["dtypes"].replace("str", "object") + return data, mask, info diff --git a/src/cdm_reader_mapper/mdf_reader/schemas/schemas.py b/src/cdm_reader_mapper/mdf_reader/schemas/schemas.py index 6ae0e845..fd193df9 100755 --- a/src/cdm_reader_mapper/mdf_reader/schemas/schemas.py +++ b/src/cdm_reader_mapper/mdf_reader/schemas/schemas.py @@ -123,8 +123,6 @@ def _resolve_schema_files( if ext_schema_path: schema_path = Path(ext_schema_path).resolve() path = schema_path / f"{schema_path.name}.json" - # print(path) - # exit() if not path.is_file(): raise FileNotFoundError(f"Can't find input schema path {ext_schema_path}") return [path] diff --git a/src/cdm_reader_mapper/mdf_reader/utils/convert_and_decode.py b/src/cdm_reader_mapper/mdf_reader/utils/convert_and_decode.py index 318770d2..3873bad9 100755 --- a/src/cdm_reader_mapper/mdf_reader/utils/convert_and_decode.py +++ b/src/cdm_reader_mapper/mdf_reader/utils/convert_and_decode.py @@ -263,6 +263,8 @@ def object_to_numeric( pd.Series Converted Series. """ + if data.dtype == "str": + data = data.astype(object) if data.dtype != "object": return data @@ -298,6 +300,8 @@ def object_to_object( pd.Series Cleaned Series. """ + if data.dtype == "str": + data = data.astype(object) if data.dtype != "object": return data @@ -308,7 +312,7 @@ def object_to_object( elif disable_white_strip == "r": data = data.str.lstrip() - return data.apply(lambda x: None if isinstance(x, str) and (x.isspace() or not x) else x) + return data.replace({"": pd.NA}) def object_to_datetime( self, @@ -332,6 +336,8 @@ def object_to_datetime( pd.Series Datetime Series. """ + if data.dtype == "str": + data = data.astype(object) if data.dtype != "object": return data diff --git a/src/cdm_reader_mapper/mdf_reader/utils/filereader.py b/src/cdm_reader_mapper/mdf_reader/utils/filereader.py index 6c2985ea..07665ccf 100755 --- a/src/cdm_reader_mapper/mdf_reader/utils/filereader.py +++ b/src/cdm_reader_mapper/mdf_reader/utils/filereader.py @@ -279,6 +279,11 @@ def _process_data( for object_column in object_columns: data[object_column] = data[object_column].str.encode(config.encoding).str.decode("utf-8") + string_cols = data.select_dtypes(include="str").columns + data[string_cols] = data[string_cols].astype(object) + object_cols = data.select_dtypes(include="object").columns + data[object_cols] = data[object_cols].fillna(None) + return data, mask, config @process_function() diff --git a/src/cdm_reader_mapper/mdf_reader/utils/utilities.py b/src/cdm_reader_mapper/mdf_reader/utils/utilities.py index f34decac..ca940189 100755 --- a/src/cdm_reader_mapper/mdf_reader/utils/utilities.py +++ b/src/cdm_reader_mapper/mdf_reader/utils/utilities.py @@ -451,10 +451,12 @@ def convert_str_boolean(x: Any) -> Any: bool or original value True if 'True', False if 'False', else original value. """ + if pd.isna(x): + return x if x == "True": - x = True + return True if x == "False": - x = False + return False return x @@ -498,4 +500,5 @@ def remove_boolean_values(data: pd.DataFrame, dtypes: dict[str, str]) -> pd.Data """ data = data.map(_remove_boolean_values) dtype = _adjust_dtype(dtypes, data) + data = data.apply(lambda series: series.str.strip() if series.dtype == "str" else series) return data.astype(dtype) diff --git a/src/cdm_reader_mapper/mdf_reader/utils/validators.py b/src/cdm_reader_mapper/mdf_reader/utils/validators.py index a8f0ffdd..110674fd 100755 --- a/src/cdm_reader_mapper/mdf_reader/utils/validators.py +++ b/src/cdm_reader_mapper/mdf_reader/utils/validators.py @@ -237,9 +237,9 @@ def validate( # Explicit boolean literals ("True"/"False") override validation results if validated_columns: validated_columns = list(dict.fromkeys(validated_columns)) - to_bool = data[validated_columns].applymap(convert_str_boolean) - false_mask = to_bool.applymap(_is_false) - true_mask = to_bool.applymap(_is_true) + to_bool = data[validated_columns].apply(lambda col: col.map(convert_str_boolean)) + false_mask = to_bool.apply(lambda col: col.map(_is_false)) + true_mask = to_bool.apply(lambda col: col.map(_is_true)) mask[validated_columns] = mask[validated_columns].mask(false_mask, False) mask[validated_columns] = mask[validated_columns].mask(true_mask, True) diff --git a/tests/test_cdm_io.py b/tests/test_cdm_io.py index 01aacea2..1050ca2f 100755 --- a/tests/test_cdm_io.py +++ b/tests/test_cdm_io.py @@ -202,7 +202,9 @@ def test_read_data_csv(csv_path, example_data): data = bundle.data assert isinstance(data, pd.DataFrame) - pd.testing.assert_frame_equal(bundle.data, example_data.astype(str)) + exp_data = example_data.astype(str).astype(object) + + pd.testing.assert_frame_equal(bundle.data, exp_data) def test_read_data_single_csv(csv_path, example_data): @@ -214,7 +216,9 @@ def test_read_data_single_csv(csv_path, example_data): data = bundle.data assert isinstance(data, pd.DataFrame) - pd.testing.assert_frame_equal(bundle.data, example_data["observations-sst"].astype(str)) + exp_data = example_data["observations-sst"].astype(str).astype(object) + + pd.testing.assert_frame_equal(bundle.data, exp_data) def test_read_data_raises_data_format(csv_path): @@ -244,7 +248,10 @@ def test_read_data_parquet(parquet_path, example_data): data = bundle.data assert isinstance(data, pd.DataFrame) - pd.testing.assert_frame_equal(bundle.data, example_data) + exp_data = example_data.copy() + exp_data[("observations-sst", "B")] = exp_data[("observations-sst", "B")].astype(object) + + pd.testing.assert_frame_equal(bundle.data, exp_data) def test_read_data_feather(feather_path, example_data): @@ -256,7 +263,10 @@ def test_read_data_feather(feather_path, example_data): data = bundle.data assert isinstance(data, pd.DataFrame) - pd.testing.assert_frame_equal(bundle.data, example_data) + exp_data = example_data.copy() + exp_data[("observations-sst", "B")] = exp_data[("observations-sst", "B")].astype(object) + + pd.testing.assert_frame_equal(bundle.data, exp_data) def test_table_to_file_raises(csv_path, example_data): @@ -307,6 +317,7 @@ def test_read_tables_testdata_str_conversion(tmp_path): db_tmp = read_tables(tmp_path, suffix="str") expected = read_tables(source, extension="pq", to_str=True, imodel=imodel) + pd.testing.assert_frame_equal(db_tmp.data["header"], expected.data) diff --git a/tests/test_cdm_mapper.py b/tests/test_cdm_mapper.py index 66d2cb70..33774566 100755 --- a/tests/test_cdm_mapper.py +++ b/tests/test_cdm_mapper.py @@ -61,7 +61,7 @@ def data_header_expected(): ("header", "duplicate_status"): [4, 4, 4, 4], ("header", "platform_type"): [2, 33, 32, 45], ("header", "location_quality"): [2, 0, 0, 0], - ("header", "source_id"): [pd.NA, pd.NA, pd.NA, pd.NA], + ("header", "source_id"): [None, None, None, None], } ) return data.astype( @@ -493,39 +493,39 @@ def test_map_model_pub47(): ] dtypes = { - ("header", "station_name"): str, + ("header", "station_name"): "object", ("header", "platform_sub_type"): "Int64", - ("header", "primary_station_id"): str, + ("header", "primary_station_id"): "object", ("header", "station_record_number"): "Int64", ("header", "report_duration"): "Int64", ("observations-at", "sensor_automation_status"): "Int64", ("observations-at", "z_coordinate"): "Float64", ("observations-at", "observation_height_above_station_surface"): "Float64", - ("observations-at", "sensor_id"): str, + ("observations-at", "sensor_id"): "object", ("observations-dpt", "sensor_automation_status"): "Int64", ("observations-dpt", "z_coordinate"): "Float64", ("observations-dpt", "observation_height_above_station_surface"): "Float64", - ("observations-dpt", "sensor_id"): str, + ("observations-dpt", "sensor_id"): "object", ("observations-slp", "sensor_automation_status"): "Int64", ("observations-slp", "z_coordinate"): "Float64", ("observations-slp", "observation_height_above_station_surface"): "Float64", - ("observations-slp", "sensor_id"): str, + ("observations-slp", "sensor_id"): "object", ("observations-sst", "sensor_automation_status"): "Int64", ("observations-sst", "z_coordinate"): "Float64", ("observations-sst", "observation_height_above_station_surface"): "Float64", - ("observations-sst", "sensor_id"): str, + ("observations-sst", "sensor_id"): "object", ("observations-wbt", "sensor_automation_status"): "Int64", ("observations-wbt", "z_coordinate"): "Float64", ("observations-wbt", "observation_height_above_station_surface"): "Float64", - ("observations-wbt", "sensor_id"): str, + ("observations-wbt", "sensor_id"): "object", ("observations-wd", "sensor_automation_status"): "Int64", ("observations-wd", "z_coordinate"): "Float64", ("observations-wd", "observation_height_above_station_surface"): "Float64", - ("observations-wd", "sensor_id"): str, + ("observations-wd", "sensor_id"): "object", ("observations-ws", "sensor_automation_status"): "Int64", ("observations-ws", "z_coordinate"): "Float64", ("observations-ws", "observation_height_above_station_surface"): "Float64", - ("observations-ws", "sensor_id"): str, + ("observations-ws", "sensor_id"): "object", } result = result[columns] diff --git a/tests/test_common_select.py b/tests/test_common_select.py index 8cf10e3a..7c633271 100755 --- a/tests/test_common_select.py +++ b/tests/test_common_select.py @@ -291,6 +291,8 @@ def test_split_by_boolean_psr_true( ) exp = sample_psr.copy().read() + exp["B"] = exp["B"].astype(object) + exp_selected = exp.loc[exp_selected_idx] exp_rejected = exp.loc[exp_rejected_idx] @@ -369,6 +371,8 @@ def test_split_by_boolean_psr_false( ) exp = sample_psr.copy().read() + exp["B"] = exp["B"].astype(object) + exp_selected = exp.loc[exp_selected_idx] exp_rejected = exp.loc[exp_rejected_idx] diff --git a/tests/test_databundle.py b/tests/test_databundle.py index 635247be..468bb78e 100755 --- a/tests/test_databundle.py +++ b/tests/test_databundle.py @@ -927,7 +927,7 @@ def test_correct_pt_df(): result = db.correct_pt() - expected = pd.DataFrame({PT: ["5", "7", "5"]}) + expected = pd.DataFrame({PT: ["5", "7", "5"]}, dtype=object) pd.testing.assert_frame_equal(result.data, expected) @@ -941,7 +941,7 @@ def test_correct_pt_psr(): result = db.correct_pt() - expected = pd.DataFrame({PT: ["5", "7", "5"]}) + expected = pd.DataFrame({PT: ["5", "7", "5"]}, dtype=object) pd.testing.assert_frame_equal(result.data.read(), expected) diff --git a/tests/test_mapper_conversions.py b/tests/test_mapper_conversions.py index 2c2de3e0..d0a4cc67 100755 --- a/tests/test_mapper_conversions.py +++ b/tests/test_mapper_conversions.py @@ -97,7 +97,7 @@ def test_convert_array_general_to_str(exp, dtype, data): def test_convert_str_to_str(data, exp): series = pd.Series(data) result = _convert_str_to_str(series, "null") - expected = pd.Series(exp) + expected = pd.Series(exp, dtype=object) pd.testing.assert_series_equal(result, expected) @@ -164,8 +164,8 @@ def test_convert_str_array_from_str(data, exp): def test_convert_integer_to_str(data, exp): series = pd.Series(data) result = _convert_integer_to_str(series, "null") - expected_series = pd.Series(exp) - pd.testing.assert_series_equal(result, expected_series) + expected = pd.Series(exp, dtype=object) + pd.testing.assert_series_equal(result, expected) @pytest.mark.parametrize( @@ -238,7 +238,7 @@ def test_convert_integer_array_from_str(data, exp): def test_convert_float_to_str(data, decimal_places, exp): series = pd.Series(data) result = _convert_float_to_str(series, "null", decimal_places) - expected = pd.Series(exp) + expected = pd.Series(exp, dtype=object) pd.testing.assert_series_equal(result, expected) @@ -278,7 +278,7 @@ def test_convert_float_from_str(data, exp): def test_convert_float_array_to_str(data, exp): series = pd.Series(data) result = _convert_float_array_to_str(series, "null") - expected = pd.Series(exp, dtype=object) + expected = pd.Series(exp) pd.testing.assert_series_equal(result, expected) diff --git a/tests/test_mapping_functions.py b/tests/test_mapping_functions.py index f083ba7e..a914d655 100755 --- a/tests/test_mapping_functions.py +++ b/tests/test_mapping_functions.py @@ -311,10 +311,10 @@ def test_datetime_decimalhour_to_hm(row, expected_hr, expected_m): (pd.DataFrame([]), pd.DatetimeIndex([])), ], ) -def test_datetime_imma1(df, expected): +def test_datetime_imma1_base(df, expected): obj = MappingFunctions("dummy_model") result = obj.datetime_imma1(df) - pd.testing.assert_index_equal(result, expected) + pd.testing.assert_index_equal(result, expected.astype("datetime64[ns]")) @pytest.mark.parametrize( @@ -360,7 +360,7 @@ def test_datetime_imma1_to_utc(df, expected): expected_naive = expected.tz_localize(None) if expected.tz else expected - pd.testing.assert_index_equal(result, expected_naive) + pd.testing.assert_index_equal(result, expected_naive.astype("datetime64[ns]")) @pytest.mark.parametrize( @@ -394,7 +394,7 @@ def test_datetime_imma1_to_utc(df, expected): def test_datetime_imma1_701(df, expected): obj = MappingFunctions("dummy_model") result = obj.datetime_imma1_701(df) - pd.testing.assert_index_equal(result, expected) + pd.testing.assert_index_equal(result, expected.astype("datetime64[ns]")) @pytest.mark.parametrize( @@ -418,6 +418,7 @@ def test_datetime_imma1_701(df, expected): def test_datetime_immt(df, expected): obj = MappingFunctions("dummy_model") result = obj.datetime_immt(df) + expected = expected.astype("datetime64[ns]") pd.testing.assert_index_equal(result, expected) @@ -480,6 +481,7 @@ def test_datetime_craid(df, expected): def test_datetime_marob(df, expected): obj = MappingFunctions("dummy_model") result = obj.datetime_marob(df) + expected = expected.astype("datetime64[ns]") pd.testing.assert_series_equal(result, expected) @@ -756,13 +758,13 @@ def test_observing_programme(input_series, expected): @pytest.mark.parametrize( "input_series, prepend, append, separator, expected", [ - (pd.Series(["a", None, "c"]), "X", "Y", "", pd.Series(["XaY", None, "XcY"])), + (pd.Series(["a", None, "c"], dtype=object), "X", "Y", "", pd.Series(["XaY", None, "XcY"], dtype=object)), ( - pd.Series(["a", None, "c"]), + pd.Series(["a", None, "c"], dtype=object), "pre", "_app", "-", - pd.Series(["pre-a-_app", None, "pre-c-_app"]), + pd.Series(["pre-a-_app", None, "pre-c-_app"], dtype=object), ), (pd.Series([]), "X", "Y", "", pd.Series([], dtype=object)), ], @@ -770,7 +772,7 @@ def test_observing_programme(input_series, expected): def test_string_add(input_series, prepend, append, separator, expected): obj = MappingFunctions("dummy_model") result = obj.string_add(input_series, prepend=prepend, append=append, separator=separator) - pd.testing.assert_series_equal(result, expected) + pd.testing.assert_series_equal(result, expected.astype(object)) @pytest.mark.parametrize( @@ -783,7 +785,7 @@ def test_string_add(input_series, prepend, append, separator, expected): "-", None, None, - pd.Series(["X-1-3-Y", "X-2-4-Y"]), + pd.Series(["X-1-3-Y", "X-2-4-Y"], dtype=object), ), ( pd.DataFrame({"A": [1, 2], "B": [3, 4]}), @@ -792,7 +794,7 @@ def test_string_add(input_series, prepend, append, separator, expected): "-", [0, 1], [2, 2], - pd.Series(["X-01-03-Y", "X-02-04-Y"]), + pd.Series(["X-01-03-Y", "X-02-04-Y"], dtype=object), ), ( pd.DataFrame({"A": [1, 2], "B": [3, 4]}), @@ -801,7 +803,7 @@ def test_string_add(input_series, prepend, append, separator, expected): "-", None, None, - pd.Series(["1-3", "2-4"]), + pd.Series(["1-3", "2-4"], dtype=object), ), ( pd.DataFrame(columns=["A", "B"]), @@ -819,7 +821,7 @@ def test_string_add(input_series, prepend, append, separator, expected): ":", [0], [3], - pd.Series(["P:005:Q", "P:006:Q"]), + pd.Series(["P:005:Q", "P:006:Q"], dtype=object), ), ], ) @@ -954,7 +956,7 @@ def test_feet_to_m(input_series, expected): pd.DataFrame({"AAAA": [12], "MM": [3], "YY": [7], "GG": [5]}), "", "", - pd.Series(["a57ea24d0eb65ca390a63bd175c906db"], dtype="object"), + pd.Series(["a57ea24d0eb65ca390a63bd175c906db"], dtype=object), ), ( pd.DataFrame({"AAAA": [1, 2024], "MM": [1, 12], "YY": [1, 99], "GG": [1, 23]}), @@ -965,20 +967,20 @@ def test_feet_to_m(input_series, expected): "de3d414a8823554bbfde50f2305958d0", "5f4b0ac6560552bf9e69cc9de0541bd6", ], - dtype="object", + dtype=object, ), ), ( pd.DataFrame({"AAAA": [50], "MM": [6], "YY": [24], "GG": [4]}), "PRE-", "-POST", - pd.Series(["PRE-1d37cb121ceb546daba6431da61cd309-POST"], dtype="object"), + pd.Series(["PRE-1d37cb121ceb546daba6431da61cd309-POST"], dtype=object), ), ( pd.DataFrame({"AAAA": [], "MM": [], "YY": [], "GG": []}), "", "", - pd.Series([], dtype="object"), + pd.Series([], dtype=object), ), ], ) diff --git a/tests/test_mdf_reader.py b/tests/test_mdf_reader.py index 3ac303af..d1498854 100755 --- a/tests/test_mdf_reader.py +++ b/tests/test_mdf_reader.py @@ -43,6 +43,7 @@ def _read_mdf_test_data(data_model, select=None, drop=None, drop_idx=None, **kwa data = test_data[f"test_{data_model}"]["mdf_data"] mask = test_data[f"test_{data_model}"]["mdf_mask"] + print(data) expected = read_data(data_file=data, mask_file=mask) if not isinstance(result.data, pd.DataFrame): @@ -457,13 +458,15 @@ def test_validate_read_mdf_args_invalid_years(tmp_path): @pytest.fixture def example_data(): - return pd.DataFrame( + df = pd.DataFrame( { "A": [1, 2, 3], "B": [4.0, 5.0, 6.0], "C": ["x", "y", "z"], - } + }, ) + df = df.astype({"A": "int", "B": "float", "C": "object"}) + return df @pytest.fixture @@ -530,6 +533,7 @@ def feather_files(tmp_path, example_data, example_mask): def test_read_data_with_mask_csv(csv_files, example_data, example_mask, example_info): + print(example_data) data_file, mask_file, _ = csv_files data, mask, info = _read_data( data_file=data_file, diff --git a/tests/test_metmetpy.py b/tests/test_metmetpy.py index 746a1249..85f2a0f2 100755 --- a/tests/test_metmetpy.py +++ b/tests/test_metmetpy.py @@ -89,7 +89,7 @@ def test_icoads_to_datetime_basis(): ] ) ) - pd.testing.assert_series_equal(result, expected) + pd.testing.assert_series_equal(result, expected.astype("datetime64[ns]")) def test_icoads_to_datetime_missing_values(): @@ -113,7 +113,7 @@ def test_icoads_to_datetime_missing_values(): ] ) ) - pd.testing.assert_series_equal(result, expected) + pd.testing.assert_series_equal(result, expected.astype("datetime64[ns]")) def test_icoads_from_datetime_basis(): @@ -202,7 +202,9 @@ def test_icoads_to_datetime_missing_columns(): result = icoads(df, "to_datetime") - pd.testing.assert_series_equal(result, pd.Series(pd.to_datetime([None]))) + expected = pd.Series(pd.to_datetime([None])) + + pd.testing.assert_series_equal(result, expected.astype("datetime64[ns]")) def test_icoads_from_datetime_empty_series(): @@ -228,7 +230,7 @@ def test_to_datetime_basis(): expected = pd.Series(pd.to_datetime(["2000-01-10 12:30:00", "2001-02-15 06:15:00"])) - pd.testing.assert_series_equal(result, expected) + pd.testing.assert_series_equal(result, expected.astype("datetime64[ns]")) def test_to_datetime_no_correction(): @@ -894,7 +896,7 @@ def test_correct_pt_valid_iterable(): df2 = pd.DataFrame({PT: ["6", "7", None]}, index=[3, 4, 5]) result = correct_pt(ParquetStreamReader(iter([df1, df2])), "icoads_r300_d993") - exp = pd.DataFrame({PT: ["5", "7", "5", "6", "7", "5"]}) + exp = pd.DataFrame({PT: ["5", "7", "5", "6", "7", "5"]}, dtype="object") pd.testing.assert_frame_equal(result.read(), exp) diff --git a/tests/test_reader_convert_and_decode.py b/tests/test_reader_convert_and_decode.py index adbc3eb9..3a22b9a1 100755 --- a/tests/test_reader_convert_and_decode.py +++ b/tests/test_reader_convert_and_decode.py @@ -139,8 +139,7 @@ def test_object_to_object_strip(): series = pd.Series([" a ", "", " ", "b"]) result = conv.object_to_object(series) - - assert result.tolist() == ["a", None, None, "b"] + assert result.tolist() == ["a", pd.NA, pd.NA, "b"] def test_object_to_object_disable_lstrip(): diff --git a/tests/test_reader_utilities.py b/tests/test_reader_utilities.py index 31bdda23..e59f6ecf 100755 --- a/tests/test_reader_utilities.py +++ b/tests/test_reader_utilities.py @@ -231,12 +231,14 @@ def test_adjust_dtype(): assert _adjust_dtype("str", df) == "str" -def test_remove_boolean_values(): +def test_remove_boolean_values_sequence(): df = pd.DataFrame({"A": ["True", "False", "hello"], "B": [1, 2, 3]}) dtypes = {"A": "object", "B": "int"} result = remove_boolean_values(df, dtypes) - assert result.loc[0, "A"] is None - assert result.loc[1, "A"] is None + import numpy as np + + assert result.loc[0, "A"] is np.nan + assert result.loc[1, "A"] is np.nan assert result.loc[2, "A"] == "hello" assert result["B"].dtype.name == "int64" diff --git a/tests/test_writers.py b/tests/test_writers.py index 822369cd..6cfc6bbe 100755 --- a/tests/test_writers.py +++ b/tests/test_writers.py @@ -31,7 +31,6 @@ def db_data(): pattern = f"test_{imodel}" data_file = test_data[pattern]["mdf_data"] - db = read(data_file, mode="data") db.imodel = imodel return db @@ -40,13 +39,13 @@ def db_data(): def test_write_data_csv(tmp_path, db_data): db_data.write(out_dir=tmp_path, data_format="csv") tmppath = Path(tmp_path) - db_res = read( + db_exp = read( tmppath / "data.csv", info_file=tmppath / "info.json", data_format="csv", mode="data", ) - pd.testing.assert_frame_equal(db_data.data, db_res.data) + pd.testing.assert_frame_equal(db_data.data, db_exp.data) def test_write_tables_csv(tmp_path, db_tables): From cb65872846cca6bd3e072cef120128341b0458b8 Mon Sep 17 00:00:00 2001 From: Ludwig Lierhammer Date: Wed, 24 Jun 2026 16:19:47 +0200 Subject: [PATCH 02/13] after reading TextFileReader strings are always objects --- tests/test_databundle.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/test_databundle.py b/tests/test_databundle.py index 468bb78e..20aa380a 100755 --- a/tests/test_databundle.py +++ b/tests/test_databundle.py @@ -927,7 +927,7 @@ def test_correct_pt_df(): result = db.correct_pt() - expected = pd.DataFrame({PT: ["5", "7", "5"]}, dtype=object) + expected = pd.DataFrame({PT: ["5", "7", "5"]}) pd.testing.assert_frame_equal(result.data, expected) @@ -941,7 +941,7 @@ def test_correct_pt_psr(): result = db.correct_pt() - expected = pd.DataFrame({PT: ["5", "7", "5"]}, dtype=object) + expected = pd.DataFrame({PT: ["5", "7", "5"]}, dtype="object") pd.testing.assert_frame_equal(result.data.read(), expected) From cbf490458e37d00f559f2151aa629c34bee7585c Mon Sep 17 00:00:00 2001 From: Ludwig Lierhammer Date: Wed, 24 Jun 2026 16:20:24 +0200 Subject: [PATCH 03/13] use pandas >= 3.0.0 that supports Python 3.11 or higher --- pyproject.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pyproject.toml b/pyproject.toml index 7a6dda0f..49447be5 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -93,7 +93,7 @@ dependencies = [ "h5netcdf >=1.6.1", "h5py >= 3.13.0", "numpy>=2.1.3", - "pandas>=2.2.0", + "pandas>=3.0.0", "platformdirs >4.0.0", "pyarrow >=15.0.0", "recordlinkage >= 0.15", From c5f69cb4569a0a1a85614c474ddaa3d0c8b93d74 Mon Sep 17 00:00:00 2001 From: Ludwig Lierhammer Date: Wed, 24 Jun 2026 16:31:28 +0200 Subject: [PATCH 04/13] remove vergs from the beginning of property docstrings. --- src/cdm_reader_mapper/data/__init__.py | 46 +++++++++++++------------- 1 file changed, 23 insertions(+), 23 deletions(-) diff --git a/src/cdm_reader_mapper/data/__init__.py b/src/cdm_reader_mapper/data/__init__.py index 9f1431a4..4f585219 100755 --- a/src/cdm_reader_mapper/data/__init__.py +++ b/src/cdm_reader_mapper/data/__init__.py @@ -181,7 +181,7 @@ def _get_data_dict(self, data_file: str, data_model: str, source_ext: str = "csv @property def test_icoads_r300_d714(self) -> LazyDataDict: """ - Retrieve IMMA1 deck 714 test dataset. + Test dataset for IMMA1 deck 714. Returns ------- @@ -197,7 +197,7 @@ def test_icoads_r300_d714(self) -> LazyDataDict: @property def test_icoads_r300_d701(self) -> LazyDataDict: """ - Retrieve IMMA1 deck 701 test dataset. + Test dataset for IMMA1 deck 701. Returns ------- @@ -213,7 +213,7 @@ def test_icoads_r300_d701(self) -> LazyDataDict: @property def test_icoads_r300_d706(self) -> LazyDataDict: """ - Retrieve IMMA1 deck 706 test dataset. + Test dataset for IMMA1 deck 706. Returns ------- @@ -229,7 +229,7 @@ def test_icoads_r300_d706(self) -> LazyDataDict: @property def test_icoads_r300_d705(self) -> LazyDataDict: """ - Retrieve IMMA1 deck 705 test dataset. + Test dataset for IMMA1 deck 705. Returns ------- @@ -245,7 +245,7 @@ def test_icoads_r300_d705(self) -> LazyDataDict: @property def test_icoads_r300_d702(self) -> LazyDataDict: """ - Retrieve IMMA1 deck 702 test dataset. + Test dataset for IMMA1 deck 702. Returns ------- @@ -261,7 +261,7 @@ def test_icoads_r300_d702(self) -> LazyDataDict: @property def test_icoads_r300_d707(self) -> LazyDataDict: """ - Retrieve IMMA1 deck 707 test dataset. + Test dataset for IMMA1 deck 707. Returns ------- @@ -277,7 +277,7 @@ def test_icoads_r300_d707(self) -> LazyDataDict: @property def test_icoads_r300_mixed(self) -> LazyDataDict: """ - Retrieve IMMA1 mixed test dataset. + Test dataset for IMMA1 mixed. Returns ------- @@ -293,7 +293,7 @@ def test_icoads_r300_mixed(self) -> LazyDataDict: @property def test_icoads_r302_d794(self) -> LazyDataDict: """ - Retrieve IMMA1 deck 794 test dataset. + Test dataset for IMMA1 deck 794. Returns ------- @@ -309,7 +309,7 @@ def test_icoads_r302_d794(self) -> LazyDataDict: @property def test_icoads_r300_d704(self) -> LazyDataDict: """ - Retrieve IMMA1 deck 704 test dataset. + Test dataset for IMMA1 deck 704. Returns ------- @@ -325,7 +325,7 @@ def test_icoads_r300_d704(self) -> LazyDataDict: @property def test_icoads_r300_d721(self) -> LazyDataDict: """ - Retrieve IMMA1 deck 721 test dataset. + Test dataset for IMMA1 deck 721. Returns ------- @@ -341,7 +341,7 @@ def test_icoads_r300_d721(self) -> LazyDataDict: @property def test_icoads_r300_d730(self) -> LazyDataDict: """ - Retrieve IMMA1 deck 730 test dataset. + Test dataset for IMMA1 deck 730. Returns ------- @@ -357,7 +357,7 @@ def test_icoads_r300_d730(self) -> LazyDataDict: @property def test_icoads_r300_d781(self) -> LazyDataDict: """ - Retrieve IMMA1 deck 781 test dataset. + Test dataset for IMMA1 deck 781. Returns ------- @@ -373,7 +373,7 @@ def test_icoads_r300_d781(self) -> LazyDataDict: @property def test_icoads_r300_d703(self) -> LazyDataDict: """ - Retrieve IMMA1 deck 703 test dataset. + Test dataset for IMMA1 deck 703. Returns ------- @@ -389,7 +389,7 @@ def test_icoads_r300_d703(self) -> LazyDataDict: @property def test_icoads_r300_d201(self) -> LazyDataDict: """ - Retrieve IMMA1 deck 201 test dataset. + Test dataset for IMMA1 deck 201. Returns ------- @@ -405,7 +405,7 @@ def test_icoads_r300_d201(self) -> LazyDataDict: @property def test_icoads_r300_d892(self) -> LazyDataDict: """ - Retrieve IMMA1 deck 892 test dataset. + Test dataset for IMMA1 deck 892. Returns ------- @@ -421,7 +421,7 @@ def test_icoads_r300_d892(self) -> LazyDataDict: @property def test_icoads_r300_d700(self) -> LazyDataDict: """ - Retrieve IMMA1 deck 700 test dataset. + Test dataset for IMMA1 deck 700. Returns ------- @@ -437,7 +437,7 @@ def test_icoads_r300_d700(self) -> LazyDataDict: @property def test_icoads_r302_d792(self) -> LazyDataDict: """ - Retrieve IMMA1 deck 792 test dataset. + Test dataset for IMMA1 deck 792. Returns ------- @@ -453,7 +453,7 @@ def test_icoads_r302_d792(self) -> LazyDataDict: @property def test_icoads_r302_d992(self) -> LazyDataDict: """ - Retrieve IMMA1 deck 992 test dataset. + Test dataset for IMMA1 deck 992. Returns ------- @@ -469,7 +469,7 @@ def test_icoads_r302_d992(self) -> LazyDataDict: @property def test_gdac(self) -> LazyDataDict: """ - Retrieve IMMT test dataset. + Test dataset for IMMT. Returns ------- @@ -485,7 +485,7 @@ def test_gdac(self) -> LazyDataDict: @property def test_craid(self) -> LazyDataDict: """ - Retrieve C-RAID 1260810 test dataset. + Test dataset for C-RAID 1260810. Returns ------- @@ -497,7 +497,7 @@ def test_craid(self) -> LazyDataDict: @property def test_marob(self) -> LazyDataDict: """ - Retrieve MAROB (DWD database) test dataset. + Test dataset for MAROB (DWD database). Returns ------- @@ -509,7 +509,7 @@ def test_marob(self) -> LazyDataDict: @property def test_cmems(self) -> LazyDataDict: """ - Retrieve CMEMS (copernicusmarine) test dataset. + Test dataset for CMEMS (copernicusmarine). Returns ------- @@ -521,7 +521,7 @@ def test_cmems(self) -> LazyDataDict: @property def test_pub47(self) -> dict[str, Any]: """ - Retrieve Pub47 v202501 test dataset. + Test dataset for Pub47 v202501. Returns ------- From 9e6f4282eb909f1c025cb175cea49f1002a18ade Mon Sep 17 00:00:00 2001 From: Ludwig Lierhammer Date: Thu, 25 Jun 2026 09:32:28 +0200 Subject: [PATCH 05/13] remove no more used submodule --- .../duplicates/duplicates.py | 908 ------------------ 1 file changed, 908 deletions(-) delete mode 100755 src/cdm_reader_mapper/duplicates/duplicates.py diff --git a/src/cdm_reader_mapper/duplicates/duplicates.py b/src/cdm_reader_mapper/duplicates/duplicates.py deleted file mode 100755 index 211da533..00000000 --- a/src/cdm_reader_mapper/duplicates/duplicates.py +++ /dev/null @@ -1,908 +0,0 @@ -"""Common Data Model (CDM) pandas duplicate check.""" - -from __future__ import annotations -import datetime -from collections.abc import Iterable -from copy import deepcopy -from typing import Any - -import numpy as np -import pandas as pd -import recordlinkage as rl - -from ._duplicate_settings import Compare, _compare_kwargs, _histories, _method_kwargs - - -def convert_series(df: pd.DataFrame, conversion: dict[Any, Any]) -> pd.DataFrame: - """ - Convert data types in Dataframe. - - Parameters - ---------- - df : pd.DataFrame - Input DataFrame. - conversion : dict - Conversion dictionary conating columns and - new data type as key-value pairs. - - Returns - ------- - pd.DataFrame - DataFrame with converted data types. - """ - - def convert_date_to_float(date: pd.Series | pd.DatetimeIndex) -> pd.Series: - """ - Convert datetime values to float seconds relative to the minimum value. - - Parameters - ---------- - date : pd.Series or pd.DatetimeIndex - Datetime-like values to convert. - - Returns - ------- - pd.Series - Float values representing seconds since the minimum datetime in `date`. - """ - date = date.astype("datetime64[ns]") - return (date - date.min()) / np.timedelta64(1, "s") - - df = df.copy() - for column, method in conversion.items(): - if method in locals(): - df[column] = locals()[method](df[column]) - else: - df[column] = df[column].fillna(np.nan) - df[column] = df[column].astype(method) - - df = df.infer_objects(copy=False).fillna(9999.0) - return df - - -def add_history(df: pd.DataFrame, indexes: Iterable[int]) -> pd.DataFrame: - """ - Append duplicate information to the 'history' column of a DataFrame. - - Parameters - ---------- - df : pd.DataFrame - The DataFrame containing a 'history' column. - indexes : list[int] or pd.Index - Row indexes where history should be updated. - - Returns - ------- - pd.DataFrame - A new DataFrame with updated 'history' column for the selected rows. - - Notes - ----- - - If 'history' column does not exist, it will be created with empty strings. - - Each message is prefixed with a UTC timestamp in "YYYY-MM-DD HH:MM:SS" format. - """ - - def _datetime_now() -> str: - """ - Get actual datetime. - - Returns - ------- - str - Actual datetime as string representative ("%Y-%m-%d %H:%M:%S"). - """ - try: - now = datetime.datetime.now(datetime.UTC) - except AttributeError: - now = datetime.datetime.utcnow() - - return now.strftime("%Y-%m-%d %H:%M:%S") - - df = df.copy() - - if "history" not in df.columns: - df["history"] = "" - - history_tstmp = _datetime_now() - addition = "".join([f"; {history_tstmp}. {add}" for add in _histories.items()]) - df.loc[indexes, "history"] = df.loc[indexes, "history"] + addition - return df - - -def add_duplicates(df: pd.DataFrame, dups: pd.DataFrame) -> pd.DataFrame: - """ - Add duplicate information to the DataFrame based on the `dups` table. - - Parameters - ---------- - df : pd.DataFrame - DataFrame containing a 'report_id' column. - dups : pd.DataFrame - DataFrame where the index corresponds to rows in `df` and - the values are lists of duplicate indices or duplicate IDs. - - Returns - ------- - pd.DataFrame - A new DataFrame with a 'duplicates' column containing duplicates - as a sorted string list, e.g., "{ID1,ID2}". - - Notes - ----- - - If a row has no duplicates, its 'duplicates' column is left unchanged. - - Supports duplicates represented either by IDs (str) or by indices (int) of `report_id`. - """ - - def _add_dups(row: pd.Series) -> pd.Series: - """ - Add duplicates as string representatives to series. - - Parameters - ---------- - row : pd.Series - Single row of a pd.DataFrame. - - Returns - ------- - pd.Series - Duplicates as string representatives added to `row`. - """ - idx = row.name - if idx not in dups.index: - return row - - dup_idx = dups.loc[idx].to_list() - if isinstance(dup_idx[0][0], str): - v_ = sorted(dup_idx[0]) - else: - v_ = report_ids.iloc[dup_idx[0]] - v_ = sorted(v_.tolist()) - row["duplicates"] = "{" + ",".join(v_) + "}" - return row - - df = df.copy() - - if "duplicates" not in df.columns: - df["duplicates"] = "" - - report_ids = df["report_id"] - - dtypes = df.dtypes - result = df.apply(lambda x: _add_dups(x), axis=1) - return result.astype(dtypes) - - -def add_report_quality(df: pd.DataFrame, indexes_bad: Iterable[int]) -> pd.DataFrame: - """ - Update the 'report_quality' column in a DataFrame for bad reports. - - Parameters - ---------- - df : pd.DataFrame - DataFrame containing at least a 'report_quality' column. - indexes_bad : iterable of int - Row indices in the DataFrame to mark as bad quality (value=1). - - Returns - ------- - pd.DataFrame - DataFrame with updated 'report_quality' column. - """ - df = df.copy() - df["report_quality"] = df["report_quality"].astype(int) - df.loc[indexes_bad, "report_quality"] = 1 - return df - - -class DupDetect: - """ - Class to detect, flag, and remove duplicate entries in a DataFrame using a comparison matrix from recordlinkage. - - Parameters - ---------- - data : pd.DataFrame - Original dataset. - compared : pd.DataFrame - Comparison matrix of the dataset. - method : str - Duplicate detection method used for recordlinkage indexing. - method_kwargs : dict - Keyword arguments for recordlinkage indexing method. - compare_kwargs : dict - Keyword arguments used for recordlinkage.Compare. - """ - - def __init__( - self, - data: pd.DataFrame, - compared: pd.DataFrame, - method: str, - method_kwargs: dict[Any, Any], - compare_kwargs: dict[Any, Any], - ) -> None: - """ - Initialize a DupDetect instance. - - Parameters - ---------- - data : pd.DataFrame - Original dataset. - compared : pd.DataFrame - Comparison matrix of the dataset. - method : str - Duplicate detection method used for recordlinkage indexing. - method_kwargs : dict - Keyword arguments for recordlinkage indexing method. - compare_kwargs : dict - Keyword arguments used for recordlinkage.Compare. - """ - self.data = data.copy() - self.compared = compared - self.method = method - self.method_kwargs = method_kwargs - self.compare_kwargs = compare_kwargs - - def _get_limit(self, limit: str | float | None) -> float: - """ - Resolve the duplicate threshold limit. - - Parameters - ---------- - limit : str or float - 'default', None, or a numeric limit. - - Returns - ------- - float - Threshold for total score to consider duplicates. - """ - default_limit = 0.991 - if limit is None or limit == "default": - return default_limit - - return float(limit) - - def _get_equal_musts(self) -> list[str]: - """ - Identify columns that must be equal for duplicates. - - Returns - ------- - list[str] - Columns that must match exactly to consider duplicates. - """ - equal_musts: list[str] = [] - for value in self.compare_kwargs.keys(): - if isinstance(value, str): - value_lst = [value] - else: - value_lst = list(value) - equal_musts.extend(v for v in value_lst if v in self.data.columns) - return equal_musts - - def _total_score(self) -> None: - """Compute total similarity score for each row in `self.compared`.""" - pcmax = self.compared.shape[1] - self.score = 1 - (abs(self.compared.sum(axis=1) - pcmax) / pcmax) - - def get_duplicates( - self, - keep: str | int = "first", - limit: str | float | None = "default", - equal_musts: str | list[str] | None = None, - overwrite: bool = True, - ) -> pd.DataFrame: - """ - Identify duplicate matches based on the comparison matrix. - - Parameters - ---------- - keep : str or int - Which entry to keep: 'first', 'last', or -1, 0. - limit : str or float, optional, default: default - Threshold of total similarity score to consider as duplicate. - equal_musts : str or list[str], optional - Columns that must exactly match. - overwrite : bool, default: True - Whether to recompute matches if already calculated. - - Returns - ------- - pd.DataFrame - DataFrame containing matched duplicates. - """ - if keep not in ["first", "last", -1, 0]: - raise ValueError("keep has to be one of 'first', 'last', -1 or 0.") - - if keep == "first": - keep = -1 - elif keep == "last": - keep = 0 - - self.keep = keep - if keep == 0: - self.drop = -1 - elif keep == -1: - self.drop = 0 - - if overwrite is True: - self._total_score() - self.limit = self._get_limit(limit) - cond = self.score >= self.limit - if equal_musts is None: - equal_musts = self._get_equal_musts() - if isinstance(equal_musts, str): - equal_musts = [equal_musts] - for must in equal_musts: - cond = cond & (self.compared[must]) - self.matches = self.compared[cond] - return self.matches - - def flag_duplicates( - self, - keep: str | int = "first", - limit: str | float | None = "default", - equal_musts: str | list[str] | None = None, - ) -> pd.DataFrame: - r""" - Get result dataset with flagged duplicates. - - Parameters - ---------- - keep : str or int, default: first - Which entry should be kept in result dataset. - limit : str, int or float, optional - Limit of total score that as to be exceeded to be declared as a duplicate. - Defaults to .991. - equal_musts : str or list, optional - Hashable of column name(s) that must totally be equal to be declared as a duplicate. - Default: All column names found in method_kwargs. - - Returns - ------- - pd.DataFrame - Input DataFrame with flagged duplicates, including duplicate_status_ and quality_flag_. - - References - ---------- - .. _duplicate_status: https://glamod.github.io/cdm-obs-documentation/tables/code_tables/duplicate_status/duplicate_status.html - .. _quality_flag: https://glamod.github.io/cdm-obs-documentation/tables/code_tables/quality_flag/quality_flag.html - """ - - def _get_similars(drop_dict: dict[str | int, Any], keeps: Any) -> tuple[Any, Any]: - """ - Get similar entries from a comparison dictionary. - - Parameters - ---------- - drop_dict : dict - Dictionary containing values under keys `drop_` and `keep_` used - to determine similarity relationships. - keeps : Any - Reference collection used to determine whether a value in `drop` is - considered a match. - - Returns - ------- - tuple of Any and Any - A tuple containing the matched `drop` and `keep` values converted - to integers if possible. If the values are not convertible or no match - is found, returns `(None, None)`. - """ - if drop_dict[drop] in keeps: - drops = drop_dict[drop] - keeps = drop_dict[keep] - try: - return int(drops), int(keeps) - except ValueError: - return drops, keeps - - return None, None - - def _get_duplicates(x: pd.DataFrame, last: Any) -> pd.Series: - """ - Extract unique duplicate values from a DataFrame column. - - Parameters - ---------- - x : pd.DataFrame - Input DataFrame containing the column to inspect. - last : Any - Column name used to extract values for duplicate detection. - - Returns - ------- - pd.Series - Series containing a single key "dups" with the list of unique - duplicate values found in the specified column. - """ - b = list(set(x[last].values)) - return pd.Series({"dups": b}) - - def _delete_values_equal_keys(dictionary: dict[Any, Any]) -> tuple[dict[Any, Any], list[Any]]: - """ - Remove entries where keys and values are identical. - - Parameters - ---------- - dictionary : dict - Input mapping of keys to values. - - Returns - ------- - tuple of dict and list of Any - A tuple containing: - - A filtered dictionary with identical key-value pairs removed - - A list of values that were removed because key == value - """ - new_dictionary = {} - drops = [] - for k, v in dictionary.items(): - if k == v: - drops.append(v) - continue - new_dictionary[k] = v - return new_dictionary, drops - - def replace_keeps_and_drops(df: pd.DataFrame, keep: Any) -> pd.DataFrame: - """ - Iteratively resolve and replace duplicate mappings in a DataFrame. - - Parameters - ---------- - df : pd.DataFrame - Input DataFrame containing values to be deduplicated. - keep : Any - Column name used to identify canonical ("keep") values. - - Returns - ------- - pd.DataFrame - Updated DataFrame with resolved duplicate mappings and cleaned - keep-column values. - """ - keeps = df[keep].values - while True: - df = df.sort_index() - replaces = df.apply(lambda row, keeps=keeps: _get_similars(row, keeps), axis=1) - replaces = {k: v for k, v in dict(replaces.values).items() if k is not None} - replaces, drops = _delete_values_equal_keys(replaces) - keys = replaces.keys() - values = replaces.values() - if len(drops) > 0: - df = df.drop(drops, axis="index") - df[keep] = df[keep].replace(replaces) - if not set(keys).intersection(values): - return df - - self.get_duplicates(keep=keep, limit=limit, equal_musts=equal_musts) - result = self.data.copy() - - dtypes = result.dtypes - - result["duplicate_status"] = 0 - if not hasattr(self, "matches"): - self.get_duplicates(limit="default", equal_musts=equal_musts) - - indexes = self.matches.index - indexes_df = indexes.to_frame() - drop = indexes_df.columns[self.drop] - keep = indexes_df.columns[self.keep] - indexes_df = indexes_df.drop_duplicates(subset=[drop]) - indexes_df = replace_keeps_and_drops(indexes_df, keep) - - dup_keep = indexes_df.groupby(indexes_df[keep]).apply( - lambda x: _get_duplicates(x, drop), - include_groups=False, - ) - dup_drop = indexes_df.groupby(indexes_df[drop]).apply( - lambda x: _get_duplicates(x, keep), - include_groups=False, - ) - duplicates = pd.concat([dup_keep, dup_drop]) - - indexes_good = indexes_df[keep].values.tolist() - indexes_bad = indexes_df[drop].values.tolist() - indexes = indexes_good + indexes_bad - result.loc[indexes_good, "duplicate_status"] = 1 - result.loc[indexes_bad, "duplicate_status"] = 3 - result = add_report_quality(result, indexes_bad=indexes_bad) - result = add_history(result, indexes) - result = result.sort_index(ascending=True) - result = add_duplicates(result, duplicates) - result = result.astype(dtypes) - - object_cols = result.select_dtypes(include="object").columns - result[object_cols] = result[object_cols].fillna(None) - - self.result = result - self.data = self.data.sort_index(ascending=True) - - return self.result - - def remove_duplicates( - self, - keep: str | int = "first", - limit: str | float | None = "default", - equal_musts: str | list[str] | None = None, - ) -> pd.DataFrame: - """ - Remove duplicate entries from the dataset. - - Parameters - ---------- - keep : str or int - Which entry to keep ('first' or 'last'). - limit : str or float, optional - Minimum similarity score to declare duplicates. - equal_musts : str or list[str], optional - Columns that must exactly match. - - Returns - ------- - pd.DataFrame - Dataset without duplicates. - """ - self.get_duplicates(keep=keep, limit=limit, equal_musts=equal_musts) - result = self.data.copy() - drops = self.matches.index.get_level_values(self.drop) - result = result.drop(drops) - self.result = result.sort_index(ascending=True) - self.data = self.data.sort_index(ascending=True) - return self.result - - -def set_comparer(compare_dict: dict[Any, Any]) -> Compare: - """ - Build a recordlinkage Compare object with optional conversion dictionary. - - Parameters - ---------- - compare_dict : dict - Dictionary of columns to compare, - e.g. {"column_name": {"method": "exact" | "numeric" | "date2", "kwargs": {...}}}. - - Returns - ------- - recordlinkage.Compare - Compare object with added comparison methods and a 'conversion' attribute. - """ - comparer = Compare() - comparer.conversion = {} - for column, c_dict in compare_dict.items(): - try: - method = c_dict["method"] - except KeyError as err: - raise KeyError( - "compare_kwargs must be hierarchically ordered: {: {'method': }}. 'method' not found" - ) from err - try: - kwargs = c_dict["kwargs"] - except KeyError: - kwargs = {} - getattr(comparer, method)( - column, - column, - label=column, - **kwargs, - ) - if method == "numeric": - comparer.conversion[column] = float - if method == "date": - comparer.conversion[column] = "datetime64[ns]" - if method == "date2": - comparer.conversion[column] = "convert_date_to_float" - - return comparer - - -def remove_ignores(dic: dict[Any, Any], columns: str | list[str]) -> dict[Any, Any]: - """ - Remove dictionary entries where keys or values match ignored columns. - - Parameters - ---------- - dic : dict - Original dictionary to filter. - columns : str or list[str] - Column(s) to ignore. - - Returns - ------- - dict - Filtered dictionary without the ignored columns. - """ - new_dict = {} - if isinstance(columns, str): - columns = [columns] - for k, v in dic.items(): - if k in columns: - continue - if v in columns: - continue - if isinstance(v, list): - v2 = [v_ for v_ in v if v_ not in columns] - if len(v2) == 0: - continue - v = v2 - new_dict[k] = v - return new_dict - - -def change_offsets(dic: dict[Any, Any], dic_o: dict[Any, Any]) -> dict[Any, Any]: - """ - Update the 'offset' value in compare dictionary kwargs. - - Parameters - ---------- - dic : dict - Original compare dictionary. - dic_o : dict - Dictionary mapping column names to new offsets. - - Returns - ------- - dict - Updated compare dictionary with modified offsets. - """ - for key in dic.keys(): - if key not in dic_o.keys(): - continue - dic[key]["kwargs"]["offset"] = dic_o[key] - return dic - - -def reindex_nulls(df: pd.DataFrame, null_label: Any) -> pd.DataFrame: - """ - Reindex a DataFrame in ascending order based on the number of 'null' strings in each row. - - Parameters - ---------- - df : pd.DataFrame - Input DataFrame. Cells with the string "null" are counted as nulls. - null_label : Any - Missing value representative. - - Returns - ------- - pd.DataFrame - DataFrame reindexed so that rows with fewer 'null' values appear first. - Original row order is preserved for rows with the same null count. - """ - - def is_missing(x: Any) -> bool: - """ - Determine whether a value is considered missing. - - This function supports scalar values as well as nested iterables - (lists, tuples, numpy arrays). A value is considered missing if it is: - - NaN (as defined by ``pandas.isna``) - - Equal to ``null_label`` - - Any element inside an iterable is missing (recursively checked) - - Parameters - ---------- - x : Any - Value to check for missingness. - - Returns - ------- - bool - True if the value (or any nested value) is missing, otherwise False. - """ - if isinstance(x, (list, tuple, np.ndarray)): - return any(is_missing(x_) for x_ in x) - - if pd.isna(x): - return True - - if x == null_label: - return True - - return False - - def count_nulls(row: pd.Series) -> int: - """ - Count the number of missing values in a pandas Series. - - Parameters - ---------- - row : pd.Series - Input row or Series to evaluate. - - Returns - ------- - int - Number of missing values in the Series. - """ - return sum(is_missing(x) for x in row) - - null_counts = df.apply(count_nulls, axis=1) - - if null_counts.empty: - return df - - sorted_index = null_counts.sort_values(kind="stable").index - return df.loc[sorted_index] - - -class Comparer: - """ - Wrapper around recordlinkage.Compare to compute pairwise comparisons on a DataFrame. - - This class initializes a recordlinkage indexer and Compare object, optionally converting - the data types before computing the comparisons. - - Parameters - ---------- - data : pd.DataFrame - The dataset to compare. - method : str - The indexing method from `recordlinkage.index`, e.g., 'SortedNeighbourhood'. - method_kwargs : dict - Keyword arguments to pass to the indexing method. - compare_kwargs : dict - Dictionary specifying columns and comparison methods for recordlinkage.Compare. - pairs_df : list[pd.DataFrame], optional - Optional pre-split DataFrames to pass to the indexer. Defaults to `[data]`. - convert_data : bool, default False - Whether to convert data using `compare_kwargs` conversion dictionary. - """ - - def __init__( - self, - data: pd.DataFrame, - method: str, - method_kwargs: dict[Any, Any], - compare_kwargs: dict[Any, Any], - pairs_df: list[pd.DataFrame] | None = None, - convert_data: bool = False, - ): - """ - Initialize a Comparer instance. - - Parameters - ---------- - data : pd.DataFrame - The dataset to compare. - method : str - The indexing method from `recordlinkage.index`, e.g., 'SortedNeighbourhood'. - method_kwargs : dict - Keyword arguments to pass to the indexing method. - compare_kwargs : dict - Dictionary specifying columns and comparison methods for recordlinkage.Compare. - pairs_df : list[pd.DataFrame], optional - Optional pre-split DataFrames to pass to the indexer. Defaults to `[data]`. - convert_data : bool, default False - Whether to convert data using `compare_kwargs` conversion dictionary. - """ - indexer = getattr(rl.index, method)(**method_kwargs) - comparer = set_comparer(compare_kwargs) - if convert_data is True: - data_cp = convert_series(data, comparer.conversion) - else: - data_cp = data.copy() - - if pairs_df is None: - pairs_df = [data_cp] - pairs = indexer.index(*pairs_df) - self.compared = comparer.compute(pairs, data_cp) - self.data = data_cp - - -def duplicate_check( - data: pd.DataFrame, - method: str = "SortedNeighbourhood", - method_kwargs: dict[Any, Any] | None = None, - compare_kwargs: dict[Any, Any] | None = None, - table_name: str | None = None, - ignore_columns: str | None = None, - ignore_entries: dict[str, Any] | None = None, - offsets: dict[str, Any] | None = None, - reindex_by_null: bool = True, - null_label: Any = "null", -) -> DupDetect: - """ - Run a duplicate check on a dataset using recordlinkage. - - Returns a DupDetect object. - - Parameters - ---------- - data : pandas.DataFrame - Dataset for duplicate check. - method : str, default: SortedNeighbourhood - Duplicate check method for recordlinkage. - method_kwargs : dict, optional - Keyword arguments for recordlinkage duplicate check. - Defaults to _method_kwargs. - compare_kwargs : dict, optional - Keyword arguments for recordlinkage.Compare object. - Defaults to _compare_kwargs. - table_name : str, optional - Name of the CDM table to be selected from data. - ignore_columns : str or list, optional - Name of data columns to be ignored for duplicate check. - ignore_entries : dict, optional - Key: Column name. - Value: value to be ignored. - E.g. offsets={"station_speed": null}. - offsets : dict, optional - Change offsets for recordlinkage Compare object. - Key: Column name. - Value: new offset. - E.g. offsets={"latitude": 0.1}. - reindex_by_null : bool, optional - If True data is re-indexed in ascending order according to the number of nulls in each row. - null_label : str, optional - Null label which is used if `reindex_by_null` is True. - - Returns - ------- - cdm_reader_mapper.DupDetect - A DupDetect instance. - """ - if reindex_by_null is True: - data = reindex_nulls(data, null_label=null_label) - - index = data.index - data.reset_index(drop=True) - - if table_name: - data = data[table_name] - if not method_kwargs: - method_kwargs = deepcopy(_method_kwargs) - if not compare_kwargs: - compare_kwargs = deepcopy(_compare_kwargs) - if ignore_columns: - method_kwargs = remove_ignores(method_kwargs, ignore_columns) - compare_kwargs = remove_ignores(compare_kwargs, ignore_columns) - if offsets: - compare_kwargs = change_offsets(compare_kwargs, offsets) - - dtypes = data.dtypes - - comparer = Comparer( - data=data, - method=method, - method_kwargs=method_kwargs, - compare_kwargs=compare_kwargs, - convert_data=True, - ) - compared = comparer.compared - data_ = comparer.data - - if ignore_entries is None: - return DupDetect(data, compared, method, method_kwargs, compare_kwargs) - - compared = [compared] - - for column_, entry_ in ignore_entries.items(): - if not isinstance(entry_, list): - entry_ = [entry_] - entries = data[column_].isin(entry_) - - d1 = data.mask(entries).dropna(how="all") - d2 = data.where(entries).dropna(how="all") - if d1.empty: - continue - if d2.empty: - continue - - method_kwargs_ = remove_ignores(method_kwargs, column_) - compare_kwargs_ = remove_ignores(compare_kwargs, column_) - - compared_ = Comparer( - data=data_, - method=method, - method_kwargs=method_kwargs_, - compare_kwargs=compare_kwargs_, - pairs_df=[d2, d1], - ).compared - compared_[list(ignore_entries.keys())] = 1 - compared.append(compared_) - - compared = pd.concat(compared) - data.set_index(index, inplace=True) - data = data.astype(dtypes) - return DupDetect(data, compared, method, method_kwargs, compare_kwargs) From 389f5fd00eb0112279ea4bf9fdb40813fbe081a7 Mon Sep 17 00:00:00 2001 From: Ludwig Lierhammer Date: Thu, 25 Jun 2026 09:54:20 +0200 Subject: [PATCH 06/13] update CHANGELOG --- CHANGELOG.rst | 7 ++++--- environment-docs.yml | 2 +- 2 files changed, 5 insertions(+), 4 deletions(-) diff --git a/CHANGELOG.rst b/CHANGELOG.rst index 69bab7ad..3a337a1d 100644 --- a/CHANGELOG.rst +++ b/CHANGELOG.rst @@ -10,7 +10,7 @@ Contributor to this version: Ludwig Lierhammer (:user:`ludwiglierhammer`) Announcements ^^^^^^^^^^^^^ * `cdm_reader_mapper` now drops support for Python 3.10 (:pull:`419`) -* `cdm_reader_mapper` now uses `cruft ` and the `Ouranosinc cookiecutter template ` (:issue:`369`, :pull:`419`) +* `cdm_reader_mapper` now uses `cruft `_ and the `Ouranosinc cookiecutter template `_ (:issue:`369`, :pull:`419`) New features and enhancements ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ @@ -20,7 +20,7 @@ Breaking changes ^^^^^^^^^^^^^^^^ * Development dependencies ("dev", "docs") are now installed via the new `dependency-groups` conventions (`PEP 735 `_) (:pull:`419`) * `prek` is now the suggested pre-commit runner (installed by default via `pip install --group dev`) (:pull:`419`) -* delete submodule ``src.cdm_reader_mapper.duplicates`` (:issue:`152`, :issue:`283`, :pull:`434`) +* submodule ``src.cdm_reader_mapper.duplicates`` has been deleted and moved to `marine_qc `_ (:issue:`283`, :pull:`434`) * ``cdm_reader_mapper.DupDetect`` is not importable anymore * ``cdm_reader_mapper.duplicate_check`` is not importable anymore @@ -30,7 +30,8 @@ Breaking changes * ``cdm_reader_mapper.DataBundle.remove_duplicates`` is not callable anymore * ``cdm_reader_mapper.DataBundle`` does not have attribute ``DupDetect`` anymore -* submodule ``src.cdm_reader_mapper.duplicates`` has been moved to `marine_qc `_ (:issue:`283`, :pull:`434`) +* the code has been updated to support pandas >= 3.0.0 (:pull:`435`) +* reader functions now return object types instead of string types (:pull:`435`) Internal changes diff --git a/environment-docs.yml b/environment-docs.yml index e67b4c5b..b9676e4f 100755 --- a/environment-docs.yml +++ b/environment-docs.yml @@ -12,7 +12,7 @@ dependencies: - h5netcdf >=1.6.1 - h5py >=3.13.0 - numpy >=2.1.3 - - pandas >=2.2.0 + - pandas >=3.0.0 - pyarrow >=15.0.0 # Strongly encouraged for Pandas v2.2.0+ - timezonefinder >6.5.0 - xarray >=2023.11.0,!=2024.10.0 From 472b59f4646ca5bb57521ac8372e7b757874f493 Mon Sep 17 00:00:00 2001 From: Ludwig Lierhammer Date: Thu, 25 Jun 2026 11:28:11 +0200 Subject: [PATCH 07/13] Replace relative imports with absolute imports for any submodules located one or more directories above the current submodule. --- .../cdm_mapper/codes/codes.py | 5 ++-- src/cdm_reader_mapper/cdm_mapper/mapper.py | 12 +++------ .../cdm_mapper/properties.py | 4 +-- src/cdm_reader_mapper/cdm_mapper/reader.py | 12 +++------ .../cdm_mapper/tables/tables.py | 5 ++-- .../cdm_mapper/utils/conversions.py | 4 +-- .../cdm_mapper/utils/utilities.py | 2 +- src/cdm_reader_mapper/cdm_mapper/writer.py | 2 +- src/cdm_reader_mapper/common/iterators.py | 7 +++-- src/cdm_reader_mapper/core/_utilities.py | 2 +- src/cdm_reader_mapper/core/databundle.py | 3 ++- src/cdm_reader_mapper/core/reader.py | 2 +- src/cdm_reader_mapper/core/writer.py | 3 +-- .../mdf_reader/codes/codes.py | 5 ++-- .../mdf_reader/properties.py | 2 +- src/cdm_reader_mapper/mdf_reader/reader.py | 9 +++---- .../mdf_reader/schemas/schemas.py | 5 ++-- .../mdf_reader/utils/convert_and_decode.py | 3 ++- .../mdf_reader/utils/filereader.py | 9 +++---- .../mdf_reader/utils/parser.py | 5 ++-- .../mdf_reader/utils/utilities.py | 2 +- .../mdf_reader/utils/validators.py | 5 ++-- src/cdm_reader_mapper/mdf_reader/writer.py | 7 ++--- src/cdm_reader_mapper/metmetpy/correct.py | 5 ++-- .../metmetpy/datetime/correction_functions.py | 3 ++- .../metmetpy/datetime/model_datetimes.py | 2 +- .../platform_type/correction_functions.py | 2 +- src/cdm_reader_mapper/metmetpy/validate.py | 5 ++-- tests/test_common_utils.py | 27 +++++++++++++++++++ 29 files changed, 85 insertions(+), 74 deletions(-) diff --git a/src/cdm_reader_mapper/cdm_mapper/codes/codes.py b/src/cdm_reader_mapper/cdm_mapper/codes/codes.py index e3388a67..a9a0c058 100755 --- a/src/cdm_reader_mapper/cdm_mapper/codes/codes.py +++ b/src/cdm_reader_mapper/cdm_mapper/codes/codes.py @@ -13,14 +13,13 @@ from pathlib import Path from typing import Any -from cdm_reader_mapper.common.json_dict import ( +from cdm_reader_mapper.cdm_mapper import properties +from cdm_reader_mapper.common import ( collect_json_files, combine_dicts, open_json_file, ) -from .. import properties - def _eval(s: str) -> Any: """ diff --git a/src/cdm_reader_mapper/cdm_mapper/mapper.py b/src/cdm_reader_mapper/cdm_mapper/mapper.py index f8b4bdd9..3a8f28ab 100755 --- a/src/cdm_reader_mapper/cdm_mapper/mapper.py +++ b/src/cdm_reader_mapper/cdm_mapper/mapper.py @@ -18,11 +18,12 @@ import pandas as pd -from cdm_reader_mapper.common import logging_hdlr -from cdm_reader_mapper.common.iterators import ( +from cdm_reader_mapper.common import ( ParquetStreamReader, ProcessFunction, + logging_hdlr, process_function, + standardize_object_columns, ) from . import properties @@ -481,12 +482,7 @@ def _table_mapping( if drop_duplicates: table_df = _drop_duplicated_rows(table_df) - string_cols = table_df.select_dtypes(include="string").columns - table_df[string_cols] = table_df[string_cols].astype(object) - object_cols = table_df.select_dtypes(include="object").columns - table_df[object_cols] = table_df[object_cols].fillna(None) - - return table_df + return standardize_object_columns(table_df) def _prepare_cdm_tables(cdm_subset: str | Sequence[str]) -> dict[str, Any]: diff --git a/src/cdm_reader_mapper/cdm_mapper/properties.py b/src/cdm_reader_mapper/cdm_mapper/properties.py index f122ca61..bcf62394 100755 --- a/src/cdm_reader_mapper/cdm_mapper/properties.py +++ b/src/cdm_reader_mapper/cdm_mapper/properties.py @@ -2,7 +2,7 @@ from __future__ import annotations -from ..properties import SupportedDataModels, SupportedFileTypes +from cdm_reader_mapper.properties import SupportedDataModels, SupportedFileTypes __all__ = [ @@ -27,7 +27,7 @@ "observations-slp", ] -# ...from CDM table definitions psuedo-sql(...) -------------------------------- +# ...from CDM table definitions pseudo-sql(...) -------------------------------- pandas_dtypes: dict[str, dict[str, str]] = {} pandas_dtypes["from_sql"] = {} pandas_dtypes["from_sql"]["timestamp with timezone"] = "object" diff --git a/src/cdm_reader_mapper/cdm_mapper/reader.py b/src/cdm_reader_mapper/cdm_mapper/reader.py index b8f2c428..45282b9c 100755 --- a/src/cdm_reader_mapper/cdm_mapper/reader.py +++ b/src/cdm_reader_mapper/cdm_mapper/reader.py @@ -51,11 +51,10 @@ import pandas as pd -from cdm_reader_mapper.common import get_filename, logging_hdlr +from cdm_reader_mapper.common import get_filename, logging_hdlr, standardize_object_columns from cdm_reader_mapper.core.databundle import DataBundle -from ..properties import SupportedFileTypes -from .properties import cdm_tables +from .properties import SupportedFileTypes, cdm_tables from .utils.conversions import convert_from_str_df, convert_to_str_df from .utils.utilities import get_cdm_subset, get_usecols @@ -439,9 +438,6 @@ def read_tables( elif to_str is True: merged = convert_to_str_df(merged, imodel, cdm_subset=cdm_subset) - string_cols = merged.select_dtypes(include="string").columns - merged[string_cols] = merged[string_cols].astype(object) - object_cols = merged.select_dtypes(include="object").columns - merged[object_cols] = merged[object_cols].fillna(None) + data = standardize_object_columns(merged) - return DataBundle(data=merged, columns=merged.columns, mode="tables") + return DataBundle(data=data, columns=merged.columns, mode="tables") diff --git a/src/cdm_reader_mapper/cdm_mapper/tables/tables.py b/src/cdm_reader_mapper/cdm_mapper/tables/tables.py index 8c58395e..a9a44742 100755 --- a/src/cdm_reader_mapper/cdm_mapper/tables/tables.py +++ b/src/cdm_reader_mapper/cdm_mapper/tables/tables.py @@ -15,14 +15,13 @@ from pathlib import Path from typing import Any -from cdm_reader_mapper.common.json_dict import ( +from cdm_reader_mapper.cdm_mapper import properties +from cdm_reader_mapper.common import ( collect_json_files, combine_dicts, open_json_file, ) -from .. import properties - def get_cdm_atts( cdm_tables: str | tuple[str, str] | Sequence[str | tuple[str, str]] | None = None, diff --git a/src/cdm_reader_mapper/cdm_mapper/utils/conversions.py b/src/cdm_reader_mapper/cdm_mapper/utils/conversions.py index 47cf6eb3..32ade61b 100755 --- a/src/cdm_reader_mapper/cdm_mapper/utils/conversions.py +++ b/src/cdm_reader_mapper/cdm_mapper/utils/conversions.py @@ -8,8 +8,8 @@ import numpy as np import pandas as pd -from .. import properties -from ..tables.tables import get_cdm_atts, get_imodel_maps +from cdm_reader_mapper.cdm_mapper import properties +from cdm_reader_mapper.cdm_mapper.tables.tables import get_cdm_atts, get_imodel_maps class BaseConverter: diff --git a/src/cdm_reader_mapper/cdm_mapper/utils/utilities.py b/src/cdm_reader_mapper/cdm_mapper/utils/utilities.py index 6f405cbd..21529665 100755 --- a/src/cdm_reader_mapper/cdm_mapper/utils/utilities.py +++ b/src/cdm_reader_mapper/cdm_mapper/utils/utilities.py @@ -4,7 +4,7 @@ from collections.abc import Iterable from typing import Any -from .. import properties +from cdm_reader_mapper.cdm_mapper import properties def dict_to_tuple_list(dic: dict[Any, Any]) -> list[tuple[Any, Any]]: diff --git a/src/cdm_reader_mapper/cdm_mapper/writer.py b/src/cdm_reader_mapper/cdm_mapper/writer.py index 64288e29..1ece36fc 100755 --- a/src/cdm_reader_mapper/cdm_mapper/writer.py +++ b/src/cdm_reader_mapper/cdm_mapper/writer.py @@ -29,7 +29,7 @@ from cdm_reader_mapper.common import get_filename, logging_hdlr -from ..properties import SupportedFileTypes +from .properties import SupportedFileTypes from .tables.tables import get_cdm_atts from .utils.conversions import convert_from_str_df, convert_to_str_df from .utils.utilities import adjust_filename, dict_to_tuple_list, get_cdm_subset diff --git a/src/cdm_reader_mapper/common/iterators.py b/src/cdm_reader_mapper/common/iterators.py index 753d183c..ff4314eb 100755 --- a/src/cdm_reader_mapper/common/iterators.py +++ b/src/cdm_reader_mapper/common/iterators.py @@ -18,6 +18,8 @@ import pyarrow.parquet as pq import xarray as xr +from .object_types import standardize_object_columns + class ProcessFunction: r""" @@ -558,10 +560,7 @@ def _parquet_generator(temp_dir: TemporaryDirectory[str], data_type: type, schem for f in files: df = pd.read_parquet(f) - string_cols = df.select_dtypes(include="str").columns - df[string_cols] = df[string_cols].astype(object) - object_cols = df.select_dtypes(include="object").columns - df[object_cols] = df[object_cols].fillna(None) + df = standardize_object_columns(df) if data_type is pd.Series: s = df.iloc[:, 0].copy() diff --git a/src/cdm_reader_mapper/core/_utilities.py b/src/cdm_reader_mapper/core/_utilities.py index 7c6dff74..fdffc430 100755 --- a/src/cdm_reader_mapper/core/_utilities.py +++ b/src/cdm_reader_mapper/core/_utilities.py @@ -8,7 +8,7 @@ import numpy as np import pandas as pd -from cdm_reader_mapper.common.iterators import ( +from cdm_reader_mapper.common import ( ParquetStreamReader, is_valid_iterator, parquet_stream_from_iterable, diff --git a/src/cdm_reader_mapper/core/databundle.py b/src/cdm_reader_mapper/core/databundle.py index 4b3f20a8..985724c7 100755 --- a/src/cdm_reader_mapper/core/databundle.py +++ b/src/cdm_reader_mapper/core/databundle.py @@ -8,15 +8,16 @@ from cdm_reader_mapper.cdm_mapper.mapper import map_model from cdm_reader_mapper.common import ( + ParquetStreamReader, count_by_cat, get_length, + is_valid_iterator, replace_columns, split_by_boolean_false, split_by_boolean_true, split_by_column_entries, split_by_index, ) -from cdm_reader_mapper.common.iterators import ParquetStreamReader, is_valid_iterator from cdm_reader_mapper.metmetpy import ( correct_datetime, correct_pt, diff --git a/src/cdm_reader_mapper/core/reader.py b/src/cdm_reader_mapper/core/reader.py index a7d0efa9..d2ca72c6 100755 --- a/src/cdm_reader_mapper/core/reader.py +++ b/src/cdm_reader_mapper/core/reader.py @@ -6,8 +6,8 @@ from cdm_reader_mapper.cdm_mapper.reader import read_tables from cdm_reader_mapper.mdf_reader.reader import read_data, read_mdf +from cdm_reader_mapper.properties import SupportedReadModes -from ..properties import SupportedReadModes from .databundle import DataBundle diff --git a/src/cdm_reader_mapper/core/writer.py b/src/cdm_reader_mapper/core/writer.py index 49dba472..097217e2 100755 --- a/src/cdm_reader_mapper/core/writer.py +++ b/src/cdm_reader_mapper/core/writer.py @@ -8,8 +8,7 @@ from cdm_reader_mapper.cdm_mapper.writer import write_tables from cdm_reader_mapper.mdf_reader.writer import write_data - -from ..properties import SupportedWriteModes +from cdm_reader_mapper.properties import SupportedWriteModes supported_write_modes = get_args(SupportedWriteModes) diff --git a/src/cdm_reader_mapper/mdf_reader/codes/codes.py b/src/cdm_reader_mapper/mdf_reader/codes/codes.py index a8786913..0501f166 100755 --- a/src/cdm_reader_mapper/mdf_reader/codes/codes.py +++ b/src/cdm_reader_mapper/mdf_reader/codes/codes.py @@ -8,13 +8,12 @@ from pathlib import Path from typing import Any -from cdm_reader_mapper.common.json_dict import ( +from cdm_reader_mapper.common import ( collect_json_files, combine_dicts, open_json_file, ) - -from .. import properties +from cdm_reader_mapper.mdf_reader import properties def read_table( diff --git a/src/cdm_reader_mapper/mdf_reader/properties.py b/src/cdm_reader_mapper/mdf_reader/properties.py index 1cb9ade1..a934a368 100755 --- a/src/cdm_reader_mapper/mdf_reader/properties.py +++ b/src/cdm_reader_mapper/mdf_reader/properties.py @@ -3,7 +3,7 @@ from __future__ import annotations from typing import get_args -from ..properties import NumericTypes, ObjectTypes, SupportedDataModels +from cdm_reader_mapper.properties import NumericTypes, ObjectTypes, SupportedDataModels __all__ = [ diff --git a/src/cdm_reader_mapper/mdf_reader/reader.py b/src/cdm_reader_mapper/mdf_reader/reader.py index e1116450..ec494e0f 100755 --- a/src/cdm_reader_mapper/mdf_reader/reader.py +++ b/src/cdm_reader_mapper/mdf_reader/reader.py @@ -8,9 +8,9 @@ import pandas as pd from cdm_reader_mapper import DataBundle +from cdm_reader_mapper.common import open_json_file, standardize_object_columns +from cdm_reader_mapper.properties import SupportedFileTypes -from ..common.json_dict import open_json_file -from ..properties import SupportedFileTypes from .utils.filereader import FileReader from .utils.utilities import as_list, as_path, read_csv, read_feather, read_parquet, validate_arg @@ -302,10 +302,7 @@ def _read_data( **mask_kwargs, ) - string_cols = data.select_dtypes(include="str").columns - data[string_cols] = data[string_cols].astype(object) - object_cols = data.select_dtypes(include="object").columns - data[object_cols] = data[object_cols].fillna(None) + data = standardize_object_columns(data) if "dtypes" in info: info["dtypes"] = info["dtypes"].replace("str", "object") diff --git a/src/cdm_reader_mapper/mdf_reader/schemas/schemas.py b/src/cdm_reader_mapper/mdf_reader/schemas/schemas.py index fd193df9..7cf314e8 100755 --- a/src/cdm_reader_mapper/mdf_reader/schemas/schemas.py +++ b/src/cdm_reader_mapper/mdf_reader/schemas/schemas.py @@ -8,9 +8,8 @@ from pathlib import Path from typing import Any, TypedDict, get_args -from cdm_reader_mapper.common.json_dict import collect_json_files, combine_dicts - -from .. import properties +from cdm_reader_mapper.common import collect_json_files, combine_dicts +from cdm_reader_mapper.mdf_reader import properties class SectionDict(TypedDict, total=False): diff --git a/src/cdm_reader_mapper/mdf_reader/utils/convert_and_decode.py b/src/cdm_reader_mapper/mdf_reader/utils/convert_and_decode.py index 3873bad9..3f857653 100755 --- a/src/cdm_reader_mapper/mdf_reader/utils/convert_and_decode.py +++ b/src/cdm_reader_mapper/mdf_reader/utils/convert_and_decode.py @@ -7,7 +7,8 @@ import pandas as pd -from .. import properties +from cdm_reader_mapper.mdf_reader import properties + from .utilities import convert_str_boolean diff --git a/src/cdm_reader_mapper/mdf_reader/utils/filereader.py b/src/cdm_reader_mapper/mdf_reader/utils/filereader.py index 07665ccf..68f5ffc5 100755 --- a/src/cdm_reader_mapper/mdf_reader/utils/filereader.py +++ b/src/cdm_reader_mapper/mdf_reader/utils/filereader.py @@ -13,10 +13,10 @@ import pandas as pd import xarray as xr -from cdm_reader_mapper.common.iterators import ProcessFunction, process_function +from cdm_reader_mapper.common import ProcessFunction, process_function, standardize_object_columns from cdm_reader_mapper.core.databundle import DataBundle +from cdm_reader_mapper.mdf_reader import properties -from .. import properties from .convert_and_decode import convert_and_decode from .parser import ( ParserConfig, @@ -279,10 +279,7 @@ def _process_data( for object_column in object_columns: data[object_column] = data[object_column].str.encode(config.encoding).str.decode("utf-8") - string_cols = data.select_dtypes(include="str").columns - data[string_cols] = data[string_cols].astype(object) - object_cols = data.select_dtypes(include="object").columns - data[object_cols] = data[object_cols].fillna(None) + data = standardize_object_columns(data) return data, mask, config diff --git a/src/cdm_reader_mapper/mdf_reader/utils/parser.py b/src/cdm_reader_mapper/mdf_reader/utils/parser.py index 82d5d384..218b6eb8 100755 --- a/src/cdm_reader_mapper/mdf_reader/utils/parser.py +++ b/src/cdm_reader_mapper/mdf_reader/utils/parser.py @@ -14,8 +14,9 @@ import pandas as pd import xarray as xr -from .. import properties -from ..schemas.schemas import SchemaDict, read_schema +from cdm_reader_mapper.mdf_reader import properties +from cdm_reader_mapper.mdf_reader.schemas.schemas import SchemaDict, read_schema + from .convert_and_decode import Converters, Decoders from .utilities import convert_dtypes diff --git a/src/cdm_reader_mapper/mdf_reader/utils/utilities.py b/src/cdm_reader_mapper/mdf_reader/utils/utilities.py index ca940189..c8f1c8b3 100755 --- a/src/cdm_reader_mapper/mdf_reader/utils/utilities.py +++ b/src/cdm_reader_mapper/mdf_reader/utils/utilities.py @@ -9,7 +9,7 @@ import pandas as pd -from cdm_reader_mapper.common.iterators import ProcessFunction, process_function +from cdm_reader_mapper.common import ProcessFunction, process_function def as_list(x: str | Iterable[Any] | None) -> list[Any] | None: diff --git a/src/cdm_reader_mapper/mdf_reader/utils/validators.py b/src/cdm_reader_mapper/mdf_reader/utils/validators.py index 110674fd..4c1888cc 100755 --- a/src/cdm_reader_mapper/mdf_reader/utils/validators.py +++ b/src/cdm_reader_mapper/mdf_reader/utils/validators.py @@ -8,8 +8,9 @@ import numpy as np import pandas as pd -from .. import properties -from ..codes import codes +from cdm_reader_mapper.mdf_reader import properties +from cdm_reader_mapper.mdf_reader.codes import codes + from .utilities import convert_str_boolean diff --git a/src/cdm_reader_mapper/mdf_reader/writer.py b/src/cdm_reader_mapper/mdf_reader/writer.py index 6193a042..a16209c2 100755 --- a/src/cdm_reader_mapper/mdf_reader/writer.py +++ b/src/cdm_reader_mapper/mdf_reader/writer.py @@ -9,13 +9,14 @@ import pandas as pd -from ..common import get_filename -from ..common.iterators import ( +from cdm_reader_mapper.common import ( ParquetStreamReader, + get_filename, is_valid_iterator, parquet_stream_from_iterable, ) -from ..properties import SupportedFileTypes +from cdm_reader_mapper.properties import SupportedFileTypes + from .utils.utilities import join, update_column_names, update_dtypes diff --git a/src/cdm_reader_mapper/metmetpy/correct.py b/src/cdm_reader_mapper/metmetpy/correct.py index 64bba177..a9e5f37a 100755 --- a/src/cdm_reader_mapper/metmetpy/correct.py +++ b/src/cdm_reader_mapper/metmetpy/correct.py @@ -62,9 +62,8 @@ import pandas as pd -from ..common import logging_hdlr -from ..common.iterators import ProcessFunction, process_function -from ..common.json_dict import collect_json_files, combine_dicts +from cdm_reader_mapper.common import ProcessFunction, collect_json_files, combine_dicts, logging_hdlr, process_function + from . import properties from .datetime import correction_functions as corr_f_dt from .platform_type import correction_functions as corr_f_pt diff --git a/src/cdm_reader_mapper/metmetpy/datetime/correction_functions.py b/src/cdm_reader_mapper/metmetpy/datetime/correction_functions.py index 75ccdfc6..096893bb 100755 --- a/src/cdm_reader_mapper/metmetpy/datetime/correction_functions.py +++ b/src/cdm_reader_mapper/metmetpy/datetime/correction_functions.py @@ -10,7 +10,8 @@ import pandas as pd -from .. import properties +from cdm_reader_mapper.metmetpy import properties + from . import model_datetimes diff --git a/src/cdm_reader_mapper/metmetpy/datetime/model_datetimes.py b/src/cdm_reader_mapper/metmetpy/datetime/model_datetimes.py index 98db90ac..69224682 100755 --- a/src/cdm_reader_mapper/metmetpy/datetime/model_datetimes.py +++ b/src/cdm_reader_mapper/metmetpy/datetime/model_datetimes.py @@ -18,7 +18,7 @@ import numpy as np import pandas as pd -from .. import properties +from cdm_reader_mapper.metmetpy import properties def datetime_decimalhour_to_hm(decimal_hours: float) -> tuple[int, int]: diff --git a/src/cdm_reader_mapper/metmetpy/platform_type/correction_functions.py b/src/cdm_reader_mapper/metmetpy/platform_type/correction_functions.py index 32d0bd19..1137a2ba 100755 --- a/src/cdm_reader_mapper/metmetpy/platform_type/correction_functions.py +++ b/src/cdm_reader_mapper/metmetpy/platform_type/correction_functions.py @@ -12,7 +12,7 @@ import pandas as pd -from .. import properties +from cdm_reader_mapper.metmetpy import properties def is_num(x: Any) -> bool: diff --git a/src/cdm_reader_mapper/metmetpy/validate.py b/src/cdm_reader_mapper/metmetpy/validate.py index 16bc95f0..af1fc724 100755 --- a/src/cdm_reader_mapper/metmetpy/validate.py +++ b/src/cdm_reader_mapper/metmetpy/validate.py @@ -63,9 +63,8 @@ import pandas as pd -from ..common import logging_hdlr -from ..common.iterators import ProcessFunction, process_function -from ..common.json_dict import collect_json_files, combine_dicts +from cdm_reader_mapper.common import ProcessFunction, collect_json_files, combine_dicts, logging_hdlr, process_function + from . import properties from .datetime import model_datetimes diff --git a/tests/test_common_utils.py b/tests/test_common_utils.py index 1f5d5186..8d059279 100755 --- a/tests/test_common_utils.py +++ b/tests/test_common_utils.py @@ -8,6 +8,8 @@ from pathlib import Path from urllib.parse import urlparse +import numpy as np +import pandas as pd import pytest import requests @@ -28,6 +30,7 @@ open_json_file, ) from cdm_reader_mapper.common.logging_hdlr import init_logger +from cdm_reader_mapper.common.object_types import standardize_object_columns def compute_md5(content: bytes) -> str: @@ -481,3 +484,27 @@ def test_get_path_missing_file_module_not_found(tmp_path, caplog): assert result is None assert any("No module named" in msg for msg in caplog.messages) + + +def test_standardize_object_columns(): + df = pd.DataFrame( + { + "A": pd.Series(["x", "y", None], dtype="string"), + "B": pd.Series([1.1, 2.2, 3.3]), + "C": pd.Series([1, 2, 3]), + "D": pd.Series([True, False, np.nan]), + } + ) + + result = standardize_object_columns(df) + + expected = pd.DataFrame( + { + "A": pd.Series(["x", "y", None], dtype=object), + "B": pd.Series([1.1, 2.2, 3.3]), + "C": pd.Series([1, 2, 3]), + "D": pd.Series([True, False, None], dtype=object), + } + ) + + pd.testing.assert_frame_equal(result, expected) From 10b6aea3be1b6184791a9077068f274a4ce134d4 Mon Sep 17 00:00:00 2001 From: Ludwig Lierhammer Date: Thu, 25 Jun 2026 11:28:48 +0200 Subject: [PATCH 08/13] new helper function to standardize object types in pandas DataFrames --- src/cdm_reader_mapper/common/object_types.py | 28 ++++++++++++++++++++ 1 file changed, 28 insertions(+) create mode 100755 src/cdm_reader_mapper/common/object_types.py diff --git a/src/cdm_reader_mapper/common/object_types.py b/src/cdm_reader_mapper/common/object_types.py new file mode 100755 index 00000000..1c1c23b9 --- /dev/null +++ b/src/cdm_reader_mapper/common/object_types.py @@ -0,0 +1,28 @@ +"""Utility function for reading and writing files.""" + +from __future__ import annotations +from collections.abc import Sequence +from pathlib import Path + +import pandas as pd + +def standardize_object_columns(df: pd.DataFrame) -> pd.DataFrame: + """ + Convert string columns to object dtype and replace NaNs with None. + + Parameters + ---------- + df : pd.DataFrame + The input DataFrame to be standardized. + + Returns + ------- + pd.DataFrame + The same DataFrame instance after the dtype conversion and NaN handling. + """ + df = df.copy() + string_cols = df.select_dtypes(include="string").columns + df[string_cols] = df[string_cols].astype(object) + object_cols = df.select_dtypes(include="object").columns + df[object_cols] = df[object_cols].fillna(None) + return df From 852eafa9d2f20fa9aa9bcf69568facaeab132b03 Mon Sep 17 00:00:00 2001 From: Ludwig Lierhammer Date: Thu, 25 Jun 2026 11:29:35 +0200 Subject: [PATCH 09/13] make some JSON and iterator utility functions directly importable in submodule cdm_reader_mapper.common --- src/cdm_reader_mapper/common/__init__.py | 15 +++++++++++++-- 1 file changed, 13 insertions(+), 2 deletions(-) diff --git a/src/cdm_reader_mapper/common/__init__.py b/src/cdm_reader_mapper/common/__init__.py index 22a68549..ff175e2f 100755 --- a/src/cdm_reader_mapper/common/__init__.py +++ b/src/cdm_reader_mapper/common/__init__.py @@ -2,10 +2,12 @@ from __future__ import annotations -from . import json_dict from .getting_files import load_file from .inspect import count_by_cat, get_length from .io_files import get_filename +from .iterators import ParquetStreamReader, ProcessFunction, is_valid_iterator, parquet_stream_from_iterable, process_disk_backed, process_function +from .json_dict import collect_json_files, combine_dicts, open_json_file +from .object_types import standardize_object_columns from .replace import replace_columns from .select import ( split_by_boolean, @@ -17,15 +19,24 @@ __all__ = [ + "ParquetStreamReader", + "ProcessFunction", + "collect_json_files", + "combine_dicts", "count_by_cat", "get_filename", "get_length", - "json_dict", + "is_valid_iterator", "load_file", + "open_json_file", + "parquet_stream_from_iterable", + "process_disk_backed", + "process_function", "replace_columns", "split_by_boolean", "split_by_boolean_false", "split_by_boolean_true", "split_by_column_entries", "split_by_index", + "standardize_object_columns", ] From 6284a45f36685b0789865e79211939be7269f064 Mon Sep 17 00:00:00 2001 From: Ludwig Lierhammer Date: Thu, 25 Jun 2026 11:29:50 +0200 Subject: [PATCH 10/13] update CHANGELOG II --- CHANGELOG.rst | 2 ++ 1 file changed, 2 insertions(+) diff --git a/CHANGELOG.rst b/CHANGELOG.rst index 3a337a1d..698c0cee 100644 --- a/CHANGELOG.rst +++ b/CHANGELOG.rst @@ -54,6 +54,8 @@ Internal changes * remove helper class `core._utilities._DataBundle` and integrate it in `core.databundle.DataBundle` (:pull:`419`) * make use of `pathlib.Path` instead of `os.path` (:pull:`419`) * use consistently parameter "imodel" instead of "data_model" and "correction_method" instead of "fix_method" in `metmetpy` modules (:pull:`419`) +* make some JSON and iterator utility functions directly importable in submodule ``cdm_reader_mapper.common`` (:pull:`435`) +* replace relative imports with absolute imports for any submodules located one or more directories above the current submodule (:pull:`435`) 2.4.1 (2016-04-16) From 2f3c34cfd61eb2520fde3e66e46a6fcb110f8957 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Thu, 25 Jun 2026 09:31:17 +0000 Subject: [PATCH 11/13] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- src/cdm_reader_mapper/common/object_types.py | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/src/cdm_reader_mapper/common/object_types.py b/src/cdm_reader_mapper/common/object_types.py index 1c1c23b9..afe4e18e 100755 --- a/src/cdm_reader_mapper/common/object_types.py +++ b/src/cdm_reader_mapper/common/object_types.py @@ -1,20 +1,19 @@ """Utility function for reading and writing files.""" from __future__ import annotations -from collections.abc import Sequence -from pathlib import Path import pandas as pd + def standardize_object_columns(df: pd.DataFrame) -> pd.DataFrame: """ Convert string columns to object dtype and replace NaNs with None. - + Parameters ---------- df : pd.DataFrame The input DataFrame to be standardized. - + Returns ------- pd.DataFrame From 6b1991f4e8d64164f81f0674bf160bd57d29b25d Mon Sep 17 00:00:00 2001 From: Ludwig Lierhammer Date: Thu, 25 Jun 2026 11:44:59 +0200 Subject: [PATCH 12/13] fixing some ruff errors --- src/cdm_reader_mapper/common/object_types.py | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/src/cdm_reader_mapper/common/object_types.py b/src/cdm_reader_mapper/common/object_types.py index 1c1c23b9..afe4e18e 100755 --- a/src/cdm_reader_mapper/common/object_types.py +++ b/src/cdm_reader_mapper/common/object_types.py @@ -1,20 +1,19 @@ """Utility function for reading and writing files.""" from __future__ import annotations -from collections.abc import Sequence -from pathlib import Path import pandas as pd + def standardize_object_columns(df: pd.DataFrame) -> pd.DataFrame: """ Convert string columns to object dtype and replace NaNs with None. - + Parameters ---------- df : pd.DataFrame The input DataFrame to be standardized. - + Returns ------- pd.DataFrame From 0e1caf40558d07af73dde7c5647e5ffd3a9666f0 Mon Sep 17 00:00:00 2001 From: Ludwig Lierhammer Date: Thu, 25 Jun 2026 12:51:16 +0200 Subject: [PATCH 13/13] set interla module of ParquetStreamReader to cdm_reader_mapper.common --- src/cdm_reader_mapper/common/iterators.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/src/cdm_reader_mapper/common/iterators.py b/src/cdm_reader_mapper/common/iterators.py index ff4314eb..94e97bb2 100755 --- a/src/cdm_reader_mapper/common/iterators.py +++ b/src/cdm_reader_mapper/common/iterators.py @@ -422,6 +422,9 @@ def __exit__(self, _exc_type: type | None, _exc_val: BaseException | None, _exc_ self.close() +ParquetStreamReader.__module__ = "cdm_reader_mapper.common" + + def _sort_chunk_outputs( outputs: tuple[Any, ...], capture_meta: bool, requested_types: tuple[type, ...] ) -> tuple[list[pd.DataFrame | pd.Series], list[Any]]: