From c502051cd7f447144aeb70e2e247bc43cecc43eb Mon Sep 17 00:00:00 2001 From: NeurArk Date: Mon, 19 May 2025 20:27:52 +0200 Subject: [PATCH] Improve data cleaning and reporting --- README.md | 12 ++++++ datamorpher/cleaner.py | 88 ++++++++++++++++++++++++++++++++++++----- datamorpher/reporter.py | 12 ++++++ tests/test_cleaner.py | 14 ++++++- tests/test_reporter.py | 21 ++++++++++ 5 files changed, 137 insertions(+), 10 deletions(-) diff --git a/README.md b/README.md index 49df249..64a1287 100644 --- a/README.md +++ b/README.md @@ -96,6 +96,8 @@ When enabled, DataMorpher performs: - Date columns: Left as missing - Boolean columns: Mode - Invalid values reported for review +- **Smart Parsing**: Extracts numbers from corrupted strings (e.g. `"8000foo0" -> 8000`) +- **Date Formats**: Supports `%Y-%m-%d`, `%d/%m/%Y`, `%m/%d/%Y`, and `%Y/%m/%d` ## Conversion Report @@ -105,6 +107,7 @@ Each conversion generates a Markdown report containing: - Values imputed per column - Detected data types - Total execution time +- Detailed transformations of cleaned values Example report snippet: ```markdown @@ -124,6 +127,15 @@ Example report snippet: - Status: categorical ``` +The report also lists transformations applied, for example: + +```markdown +## Transformations appliquées +### Colonne 'salary' +- 8000foo0 -> 8000.0 +- NaN -> 76166.67 (mean) +``` + ## Development ### Running Tests diff --git a/datamorpher/cleaner.py b/datamorpher/cleaner.py index f8161d5..c721f2f 100644 --- a/datamorpher/cleaner.py +++ b/datamorpher/cleaner.py @@ -2,7 +2,7 @@ from __future__ import annotations -from typing import Dict +from typing import Dict, Sequence import pandas as pd from pandas.api.types import infer_dtype @@ -10,12 +10,19 @@ def clean_data(df: pd.DataFrame) -> tuple[pd.DataFrame, Dict[str, object]]: """Clean a DataFrame and report actions taken.""" - info: Dict[str, object] = {"duplicates": 0, "imputed": {}, "invalid": {}} + info: Dict[str, object] = { + "duplicates": 0, +
"imputed": {}, + "invalid": {}, + "transformations": {}, + } before = len(df) df = df.drop_duplicates() info["duplicates"] = before - len(df) + transformations: Dict[str, list[str]] = info["transformations"] + # Make a copy to avoid SettingWithCopyWarning df = df.copy() @@ -23,14 +30,18 @@ def clean_data(df: pd.DataFrame) -> tuple[pd.DataFrame, Dict[str, object]]: dtype = infer_dtype(df[col], skipna=True) series = df[col] if dtype in {"string", "mixed"}: - validated, invalid = _validate_numeric(series) + validated, invalid = _validate_numeric( + series, col, transformations + ) if validated is not None: df[col] = validated if invalid: info["invalid"][col] = invalid dtype = "floating" else: - validated, invalid = _validate_dates(series) + validated, invalid = _validate_dates( + series, column=col, transformations=transformations + ) if validated is not None: df[col] = validated if invalid: @@ -39,13 +50,21 @@ def clean_data(df: pd.DataFrame) -> tuple[pd.DataFrame, Dict[str, object]]: if df[col].isna().any(): if dtype in {"integer", "floating"}: - df[col] = df[col].fillna(df[col].mean()) + value = df[col].mean() + df[col] = df[col].fillna(value) info["imputed"][col] = "mean" + transformations.setdefault(col, []).append( + f"NaN -> {value:.2f} (mean)" + ) elif dtype in {"string", "categorical", "boolean"}: mode = df[col].mode(dropna=True) if not mode.empty: - df[col] = df[col].fillna(mode.iloc[0]) + value = mode.iloc[0] + df[col] = df[col].fillna(value) info["imputed"][col] = "mode" + transformations.setdefault(col, []).append( + f"NaN -> {value} (mode)" + ) return df, info @@ -97,22 +116,73 @@ def _words_to_num(text: str | None) -> float | None: return float(value) -def _validate_numeric(series: pd.Series) -> tuple[pd.Series | None, int]: +def _validate_numeric( + series: pd.Series, + column: str | None = None, + transformations: Dict[str, list[str]] | None = None, +) -> tuple[pd.Series | None, int]: """Return numeric series if convertible and count invalid 
entries.""" converted = pd.to_numeric(series, errors="coerce") + if converted.isna().any(): as_words = series.where(converted.isna()).apply(_words_to_num) + if transformations is not None and column is not None: + for idx, val in as_words.dropna().items(): + transformations.setdefault(column, []).append( + f"{series[idx]} -> {val}" + ) converted.update(as_words) + + if converted.isna().any(): + mask = converted.isna() + extracted = ( + series.where(mask) + .where(~series.str.contains(r"[/-]", na=False)) + .str.extract(r"(\d+\.?\d*)")[0] + ) + extracted_numeric = pd.to_numeric(extracted, errors="coerce") + if transformations is not None and column is not None: + for idx, val in extracted_numeric.dropna().items(): + transformations.setdefault(column, []).append( + f"{series[idx]} -> {val}" + ) + converted.update(extracted_numeric) + if converted.notna().sum() == 0: return None, 0 + invalid = int((converted.isna() & series.notna()).sum()) return converted, invalid -def _validate_dates(series: pd.Series) -> tuple[pd.Series | None, int]: +def _validate_dates( + series: pd.Series, + formats: Sequence[str] | None = None, + *, + column: str | None = None, + transformations: Dict[str, list[str]] | None = None, +) -> tuple[pd.Series | None, int]: """Return series of ISO formatted dates if convertible.""" - parsed = pd.to_datetime(series, errors="coerce") + if formats is None: + formats = ["%Y-%m-%d", "%d/%m/%Y", "%m/%d/%Y", "%Y/%m/%d"] + + parsed = pd.Series(pd.NaT, index=series.index) + for fmt in formats: + parsed_try = pd.to_datetime(series, errors="coerce", format=fmt) + parsed = parsed.fillna(parsed_try) + if parsed.notna().sum() == 0: return None, 0 + + if transformations is not None and column is not None: + for idx in series.index: + if pd.isna(parsed[idx]) or pd.isna(series[idx]): + continue + formatted = parsed[idx].strftime("%Y-%m-%d") + if str(series[idx]) != formatted: + transformations.setdefault(column, []).append( + f"{series[idx]} -> {formatted}" + ) + 
invalid = int(parsed.isna().sum()) return parsed.dt.strftime("%Y-%m-%d"), invalid diff --git a/datamorpher/reporter.py b/datamorpher/reporter.py index 05a28ca..cb67b2f 100644 --- a/datamorpher/reporter.py +++ b/datamorpher/reporter.py @@ -27,6 +27,8 @@ def build_report( "\n".join(f"- {c}: {m}" for c, m in clean_info["imputed"].items()) or "None" ) + transformations = clean_info.get("transformations", {}) + invalid = clean_info.get("invalid", {}) report = f"""# DataMorpher Report ## Summary @@ -39,4 +41,14 @@ def build_report( ## Column Types {table} """ + if transformations: + report += "\n\n## Transformations appliquées\n" + for col, changes in transformations.items(): + report += f"### Colonne '{col}'\n" + for change in changes: + report += f"- {change}\n" + if invalid.get(col): + report += ( + f"- {invalid[col]} valeurs invalides non récupérables\n" + ) return report diff --git a/tests/test_cleaner.py b/tests/test_cleaner.py index 16510d6..10e89ab 100644 --- a/tests/test_cleaner.py +++ b/tests/test_cleaner.py @@ -21,6 +21,18 @@ def test_numeric_and_date_validation() -> None: ) cleaned, info = clean_data(df) - assert info["invalid"] == {"num": 1, "date": 1} + assert info["invalid"] == {"date": 1} assert cleaned.loc[1, "num"] == 28 + assert cleaned.loc[2, "num"] == 8000 assert cleaned["date"].isna().sum() == 1 + + +def test_date_format_variants() -> None: + df = pd.DataFrame( + { + "date": ["2020-01-02", "03/04/2021", "05/06/2022", "2020/07/08"], + } + ) + cleaned, _ = clean_data(df) + expected = ["2020-01-02", "2021-04-03", "2022-06-05", "2020-07-08"] + assert list(cleaned["date"]) == expected diff --git a/tests/test_reporter.py b/tests/test_reporter.py index 5bf7f27..ebccf72 100644 --- a/tests/test_reporter.py +++ b/tests/test_reporter.py @@ -17,3 +17,24 @@ def test_report_contains_summary(tmp_path: Path) -> None: assert "DataMorpher Report" in report assert "in.csv" in report assert "out.csv" in report + + +def test_report_includes_transformations(tmp_path: 
Path) -> None: + types = {"a": "integer"} + clean_info = { + "duplicates": 0, + "imputed": {"a": "mean"}, + "invalid": {}, + "transformations": {"a": ["1 -> 2"]}, + } + report = build_report( + Path("in.csv"), + tmp_path / "out.csv", + 1, + 1, + clean_info, + types, + 0.1, + ) + assert "Transformations appliquées" in report + assert "1 -> 2" in report